In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, recall_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
import itertools
from torch.optim.lr_scheduler import ReduceLROnPlateau
In [2]:
# Project-local factory (dataloader_creator.py — not shown here) that reads the
# NetFlow CSV datasets and builds balanced train/test/val DataLoaders.
from dataloader_creator import CreatorDL
# Fixed seed for reproducible splits/undersampling; batch size 2048 per loader.
creator = CreatorDL(seed=42, bs=2048)
In [3]:
# NF-UNSW-NB15-v3: load the full NetFlow dataframe (reader semantics live in
# dataloader_creator.py).
df_UNSW = creator.reader("NF-UNSW-NB15-v3")

# Per the printed output below, splitter performs a per-category ~50/25/25
# train/test/val split of the rows.
df_train_UNSW, df_test_UNSW, df_val_UNSW = creator.splitter(df_UNSW)

# balancer undersamples each split to 1000 rows per attack category plus an
# equal number of Benign rows (see the "--- train ---" output: 9000/9000),
# scales features to [0, 1] on train, and wraps the result in DataLoaders.
train_loader_UNSW, test_loader_UNSW, val_loader_UNSW = creator.balancer(df_train_UNSW, df_test_UNSW, df_val_UNSW)
Processando a categoria: 'Benign'
  -> Treino: 1118865 | Teste: 559433 | Validação: 559433
Processando a categoria: 'Fuzzers'
  -> Treino: 16908 | Teste: 8454 | Validação: 8454
Processando a categoria: 'Exploits'
  -> Treino: 21374 | Teste: 10687 | Validação: 10687
Processando a categoria: 'Backdoor'
  -> Treino: 2329 | Teste: 1165 | Validação: 1165
Processando a categoria: 'Reconnaissance'
  -> Treino: 8537 | Teste: 4268 | Validação: 4269
Processando a categoria: 'Generic'
  -> Treino: 9825 | Teste: 4913 | Validação: 4913
Processando a categoria: 'DoS'
  -> Treino: 2990 | Teste: 1495 | Validação: 1495
Processando a categoria: 'Shellcode'
  -> Treino: 1190 | Teste: 595 | Validação: 596
Processando a categoria: 'Analysis'
  -> Treino: 613 | Teste: 306 | Validação: 307
Processando a categoria: 'Worms'
  -> Treino: 79 | Teste: 39 | Validação: 40

--- Base de Treino ---
Tamanho: 1182710 linhas
Categorias presentes: ['Benign' 'Exploits' 'Reconnaissance' 'Fuzzers' 'DoS' 'Generic' 'Backdoor'
 'Shellcode' 'Analysis' 'Worms']
Attack
Benign            1118865
Exploits            21374
Fuzzers             16908
Generic              9825
Reconnaissance       8537
DoS                  2990
Backdoor             2329
Shellcode            1190
Analysis              613
Worms                  79
Name: count, dtype: int64
-------------------------

--- Base de Teste ---
Tamanho: 591355 linhas
Categorias presentes: ['Benign' 'Generic' 'DoS' 'Reconnaissance' 'Exploits' 'Fuzzers' 'Backdoor'
 'Shellcode' 'Analysis' 'Worms']
Attack
Benign            559433
Exploits           10687
Fuzzers             8454
Generic             4913
Reconnaissance      4268
DoS                 1495
Backdoor            1165
Shellcode            595
Analysis             306
Worms                 39
Name: count, dtype: int64
-------------------------

--- Base de Validação ---
Tamanho: 591359 linhas
Categorias presentes: ['Benign' 'Fuzzers' 'Reconnaissance' 'Exploits' 'Generic' 'Analysis'
 'Shellcode' 'Backdoor' 'DoS' 'Worms']
Attack
Benign            559433
Exploits           10687
Fuzzers             8454
Generic             4913
Reconnaissance      4269
DoS                 1495
Backdoor            1165
Shellcode            596
Analysis             307
Worms                 40
Name: count, dtype: int64
-------------------------

--- train ---
Label
1    9000
0    9000
Name: count, dtype: int64

Attack
Benign            9000
DoS               1000
Shellcode         1000
Generic           1000
Analysis          1000
Reconnaissance    1000
Fuzzers           1000
Worms             1000
Exploits          1000
Backdoor          1000
Name: count, dtype: int64

torch.Size([18000, 32])

(tensor([0, 1]), tensor([9000, 9000]))
tensor(0.) tensor(1.) tensor(0.0516)
-------------------------

--- test ---
Label
1    9000
0    9000
Name: count, dtype: int64

Attack
Benign            9000
DoS               1000
Shellcode         1000
Generic           1000
Analysis          1000
Reconnaissance    1000
Fuzzers           1000
Worms             1000
Exploits          1000
Backdoor          1000
Name: count, dtype: int64

torch.Size([18000, 32])

(tensor([0, 1]), tensor([9000, 9000]))
tensor(-1.4981e-07) tensor(4.5768) tensor(0.0510)
-------------------------

--- val ---
Label
1    9000
0    9000
Name: count, dtype: int64

Attack
Benign            9000
DoS               1000
Shellcode         1000
Generic           1000
Analysis          1000
Reconnaissance    1000
Fuzzers           1000
Worms             1000
Exploits          1000
Backdoor          1000
Name: count, dtype: int64

torch.Size([18000, 32])

(tensor([0, 1]), tensor([9000, 9000]))
tensor(-2.9962e-07) tensor(3.5191) tensor(0.0518)
In [4]:
# NF-BoT-IoT-v3: same pipeline as the UNSW cell above — load, per-category
# ~50/25/25 split, then balance (1000 rows per attack category, 4000 Benign,
# per the output below) and build DataLoaders.
df_BOT= creator.reader("NF-BoT-IoT-v3")

df_train_BOT, df_test_BOT, df_val_BOT = creator.splitter(df_BOT)

train_loader_BOT, test_loader_BOT, val_loader_BOT = creator.balancer(df_train_BOT, df_test_BOT, df_val_BOT)
Processando a categoria: 'Benign'
  -> Treino: 25994 | Teste: 12997 | Validação: 12998
Processando a categoria: 'DDoS'
  -> Treino: 3575441 | Teste: 1787720 | Validação: 1787721
Processando a categoria: 'DoS'
  -> Treino: 4017095 | Teste: 2008547 | Validação: 2008548
Processando a categoria: 'Reconnaissance'
  -> Treino: 847566 | Teste: 423783 | Validação: 423783
Processando a categoria: 'Theft'
  -> Treino: 807 | Teste: 404 | Validação: 404

--- Base de Treino ---
Tamanho: 8466903 linhas
Categorias presentes: ['DDoS' 'DoS' 'Reconnaissance' 'Benign' 'Theft']
Attack
DoS               4017095
DDoS              3575441
Reconnaissance     847566
Benign              25994
Theft                 807
Name: count, dtype: int64
-------------------------

--- Base de Teste ---
Tamanho: 4233451 linhas
Categorias presentes: ['DDoS' 'DoS' 'Reconnaissance' 'Benign' 'Theft']
Attack
DoS               2008547
DDoS              1787720
Reconnaissance     423783
Benign              12997
Theft                 404
Name: count, dtype: int64
-------------------------

--- Base de Validação ---
Tamanho: 4233454 linhas
Categorias presentes: ['DoS' 'DDoS' 'Reconnaissance' 'Benign' 'Theft']
Attack
DoS               2008548
DDoS              1787721
Reconnaissance     423783
Benign              12998
Theft                 404
Name: count, dtype: int64
-------------------------

--- train ---
Label
1    4000
0    4000
Name: count, dtype: int64

Attack
Benign            4000
Reconnaissance    1000
DoS               1000
Theft             1000
DDoS              1000
Name: count, dtype: int64

torch.Size([8000, 32])

(tensor([0, 1]), tensor([4000, 4000]))
tensor(0.) tensor(1.) tensor(0.0232)
-------------------------

--- test ---
Label
1    4000
0    4000
Name: count, dtype: int64

Attack
Benign            4000
Reconnaissance    1000
DoS               1000
Theft             1000
DDoS              1000
Name: count, dtype: int64

torch.Size([8000, 32])

(tensor([0, 1]), tensor([4000, 4000]))
tensor(-1.1910e-07) tensor(1.4751) tensor(0.0235)
-------------------------

--- val ---
Label
1    4000
0    4000
Name: count, dtype: int64

Attack
Benign            4000
Reconnaissance    1000
DoS               1000
Theft             1000
DDoS              1000
Name: count, dtype: int64

torch.Size([8000, 32])

(tensor([0, 1]), tensor([4000, 4000]))
tensor(-1.7865e-07) tensor(5.3125) tensor(0.0232)
In [5]:
# NF-CICIDS2018-v3: same pipeline — load, per-category ~50/25/25 split, then
# balance (1000 rows per each of the 14 attack categories, 14000 Benign, per
# the output below) and build DataLoaders.
df_CIC= creator.reader("NF-CICIDS2018-v3")

df_train_CIC, df_test_CIC, df_val_CIC = creator.splitter(df_CIC)

train_loader_CIC, test_loader_CIC, val_loader_CIC = creator.balancer(df_train_CIC, df_test_CIC, df_val_CIC)
Processando a categoria: 'Benign'
  -> Treino: 8757313 | Teste: 4378656 | Validação: 4378657
Processando a categoria: 'FTP-BruteForce'
  -> Treino: 193360 | Teste: 96680 | Validação: 96680
Processando a categoria: 'SSH-Bruteforce'
  -> Treino: 94237 | Teste: 47118 | Validação: 47119
Processando a categoria: 'DoS_attacks-GoldenEye'
  -> Treino: 30650 | Teste: 15325 | Validação: 15325
Processando a categoria: 'DoS_attacks-Slowloris'
  -> Treino: 18020 | Teste: 9010 | Validação: 9010
Processando a categoria: 'DoS_attacks-SlowHTTPTest'
  -> Treino: 52775 | Teste: 26387 | Validação: 26388
Processando a categoria: 'DoS_attacks-Hulk'
  -> Treino: 50038 | Teste: 25019 | Validação: 25019
Processando a categoria: 'DDoS_attacks-LOIC-HTTP'
  -> Treino: 144294 | Teste: 72147 | Validação: 72148
Processando a categoria: 'DDOS_attack-LOIC-UDP'
  -> Treino: 1725 | Teste: 862 | Validação: 863
Processando a categoria: 'DDOS_attack-HOIC'
  -> Treino: 516155 | Teste: 258078 | Validação: 258078
Processando a categoria: 'Brute_Force_-Web'
  -> Treino: 809 | Teste: 404 | Validação: 405
Processando a categoria: 'Brute_Force_-XSS'
  -> Treino: 240 | Teste: 120 | Validação: 120
Processando a categoria: 'SQL_Injection'
  -> Treino: 220 | Teste: 110 | Validação: 110
Processando a categoria: 'Infilteration'
  -> Treino: 94076 | Teste: 47038 | Validação: 47038
Processando a categoria: 'Bot'
  -> Treino: 103851 | Teste: 51926 | Validação: 51926

--- Base de Treino ---
Tamanho: 10057763 linhas
Categorias presentes: ['Benign' 'Infilteration' 'DDoS_attacks-LOIC-HTTP' 'DDOS_attack-HOIC'
 'FTP-BruteForce' 'DoS_attacks-Hulk' 'Bot' 'DoS_attacks-GoldenEye'
 'SSH-Bruteforce' 'DoS_attacks-SlowHTTPTest' 'DoS_attacks-Slowloris'
 'Brute_Force_-Web' 'DDOS_attack-LOIC-UDP' 'Brute_Force_-XSS'
 'SQL_Injection']
Attack
Benign                      8757313
DDOS_attack-HOIC             516155
FTP-BruteForce               193360
DDoS_attacks-LOIC-HTTP       144294
Bot                          103851
SSH-Bruteforce                94237
Infilteration                 94076
DoS_attacks-SlowHTTPTest      52775
DoS_attacks-Hulk              50038
DoS_attacks-GoldenEye         30650
DoS_attacks-Slowloris         18020
DDOS_attack-LOIC-UDP           1725
Brute_Force_-Web                809
Brute_Force_-XSS                240
SQL_Injection                   220
Name: count, dtype: int64
-------------------------

--- Base de Teste ---
Tamanho: 5028880 linhas
Categorias presentes: ['Benign' 'Infilteration' 'DDOS_attack-HOIC' 'FTP-BruteForce'
 'SSH-Bruteforce' 'DDoS_attacks-LOIC-HTTP' 'DDOS_attack-LOIC-UDP' 'Bot'
 'DoS_attacks-GoldenEye' 'DoS_attacks-SlowHTTPTest' 'DoS_attacks-Hulk'
 'DoS_attacks-Slowloris' 'Brute_Force_-Web' 'Brute_Force_-XSS'
 'SQL_Injection']
Attack
Benign                      4378656
DDOS_attack-HOIC             258078
FTP-BruteForce                96680
DDoS_attacks-LOIC-HTTP        72147
Bot                           51926
SSH-Bruteforce                47118
Infilteration                 47038
DoS_attacks-SlowHTTPTest      26387
DoS_attacks-Hulk              25019
DoS_attacks-GoldenEye         15325
DoS_attacks-Slowloris          9010
DDOS_attack-LOIC-UDP            862
Brute_Force_-Web                404
Brute_Force_-XSS                120
SQL_Injection                   110
Name: count, dtype: int64
-------------------------

--- Base de Validação ---
Tamanho: 5028886 linhas
Categorias presentes: ['Benign' 'FTP-BruteForce' 'DDoS_attacks-LOIC-HTTP' 'DDOS_attack-HOIC'
 'Bot' 'SSH-Bruteforce' 'DoS_attacks-SlowHTTPTest' 'DoS_attacks-Hulk'
 'Infilteration' 'DoS_attacks-GoldenEye' 'DoS_attacks-Slowloris'
 'DDOS_attack-LOIC-UDP' 'Brute_Force_-XSS' 'Brute_Force_-Web'
 'SQL_Injection']
Attack
Benign                      4378657
DDOS_attack-HOIC             258078
FTP-BruteForce                96680
DDoS_attacks-LOIC-HTTP        72148
Bot                           51926
SSH-Bruteforce                47119
Infilteration                 47038
DoS_attacks-SlowHTTPTest      26388
DoS_attacks-Hulk              25019
DoS_attacks-GoldenEye         15325
DoS_attacks-Slowloris          9010
DDOS_attack-LOIC-UDP            863
Brute_Force_-Web                405
Brute_Force_-XSS                120
SQL_Injection                   110
Name: count, dtype: int64
-------------------------

--- train ---
Label
0    14000
1    14000
Name: count, dtype: int64

Attack
Benign                      14000
DDoS_attacks-LOIC-HTTP       1000
Brute_Force_-Web             1000
FTP-BruteForce               1000
Infilteration                1000
SSH-Bruteforce               1000
DoS_attacks-GoldenEye        1000
DoS_attacks-SlowHTTPTest     1000
DoS_attacks-Slowloris        1000
DDOS_attack-LOIC-UDP         1000
DoS_attacks-Hulk             1000
SQL_Injection                1000
Bot                          1000
DDOS_attack-HOIC             1000
Brute_Force_-XSS             1000
Name: count, dtype: int64

torch.Size([28000, 32])

(tensor([0, 1]), tensor([14000, 14000]))
tensor(0.) tensor(1.) tensor(0.0473)
-------------------------

--- test ---
Label
0    14000
1    14000
Name: count, dtype: int64

Attack
Benign                      14000
DDoS_attacks-LOIC-HTTP       1000
Brute_Force_-Web             1000
FTP-BruteForce               1000
Infilteration                1000
SSH-Bruteforce               1000
DoS_attacks-GoldenEye        1000
DoS_attacks-SlowHTTPTest     1000
DoS_attacks-Slowloris        1000
DDOS_attack-LOIC-UDP         1000
DoS_attacks-Hulk             1000
SQL_Injection                1000
Bot                          1000
DDOS_attack-HOIC             1000
Brute_Force_-XSS             1000
Name: count, dtype: int64

torch.Size([28000, 32])

(tensor([0, 1]), tensor([14000, 14000]))
tensor(0.) tensor(1.4776) tensor(0.0477)
-------------------------

--- val ---
Label
0    14000
1    14000
Name: count, dtype: int64

Attack
Benign                      14000
DDoS_attacks-LOIC-HTTP       1000
Brute_Force_-Web             1000
FTP-BruteForce               1000
Infilteration                1000
SSH-Bruteforce               1000
DoS_attacks-GoldenEye        1000
DoS_attacks-SlowHTTPTest     1000
DoS_attacks-Slowloris        1000
DDOS_attack-LOIC-UDP         1000
DoS_attacks-Hulk             1000
SQL_Injection                1000
Bot                          1000
DDOS_attack-HOIC             1000
Brute_Force_-XSS             1000
Name: count, dtype: int64

torch.Size([28000, 32])

(tensor([0, 1]), tensor([14000, 14000]))
tensor(0.) tensor(2.7903) tensor(0.0478)
In [6]:
# Bundle the per-dataset loaders so the training loop can draw one batch from
# each dataset (UNSW-NB15, BoT-IoT, CICIDS2018) per optimization step.
train_loaders = [train_loader_UNSW, train_loader_BOT, train_loader_CIC]
test_loaders = [test_loader_UNSW, test_loader_BOT, test_loader_CIC]
val_loaders = [val_loader_UNSW, val_loader_BOT, val_loader_CIC]
In [40]:
INPUT_DIM = 32

class IDSBranchyNet(nn.Module):
    """BranchyNet-style MLP intrusion detector with two exits.

    A small shared encoder feeds two heads: a cheap linear early-exit
    classifier (exit 1) and a deep MLP (exit 2). Callers invoke
    `forward_exit1` / `forward_exit2` explicitly; there is no combined
    `forward`.
    """

    # Hidden widths of the deep branch, in order.
    _EXIT2_HIDDEN = (1024, 2048, 2048, 1024)

    def __init__(self, input_dim=INPUT_DIM, num_classes=2):
        super().__init__()

        shared_width = input_dim * 2

        # Shared encoder: single affine expansion followed by ReLU.
        self.shared_layers = nn.Sequential(
            nn.Linear(input_dim, shared_width),
            nn.ReLU(),
        )

        # Exit 1: linear probe on the shared features (cheap early exit).
        self.exit1_layers = nn.Sequential(
            nn.Linear(shared_width, num_classes)
        )

        # Exit 2: deep MLP assembled layer-by-layer. The module order
        # (Linear/LeakyReLU/Dropout triplets, then the final classifier)
        # matches the hand-written version, so state_dict keys and
        # parameter-initialization order are unchanged.
        deep = []
        prev_width = shared_width
        for width in self._EXIT2_HIDDEN:
            deep += [nn.Linear(prev_width, width), nn.LeakyReLU(), nn.Dropout(0.2)]
            prev_width = width
        deep.append(nn.Linear(prev_width, num_classes))
        self.exit2_layers = nn.Sequential(*deep)

    def _encode(self, x):
        """Run the shared encoder on a batch of flat feature vectors."""
        return self.shared_layers(x)

    def forward_exit1(self, x):
        """Logits from the early-exit head."""
        return self.exit1_layers(self._encode(x))

    def forward_exit2(self, x):
        """Logits from the deep head."""
        return self.exit2_layers(self._encode(x))

model = IDSBranchyNet()
In [41]:
# Select the compute device: CUDA when available, otherwise fall back to CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print(f"Using device: {device}")
Using device: cuda
In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import itertools
import matplotlib.pyplot as plt
import numpy as np

def train_model(model, train_loaders, val_loaders, epochs, lr, device, current_threshold, patience=15):
    """Jointly train a two-exit (BranchyNet-style) model on three datasets.

    Every training step draws one batch from each of the three loaders
    (A, B, C), computes the exit-1 and exit-2 cross-entropy losses per
    dataset, and optimizes their sum (joint loss). The extremely verbose
    per-batch printing is intentional ("MODO DEBUG EXTREMO").

    Args:
        model: module exposing ``forward_exit1`` and ``forward_exit2``.
        train_loaders: exactly three training DataLoaders yielding
            ``(inputs, labels)`` batches.
        val_loaders: exactly three validation DataLoaders.
        epochs: maximum number of epochs.
        lr: Adam learning rate.
        device: torch.device to run on.
        current_threshold: softmax-confidence threshold separating the
            exit-1 mask (conf > threshold) from the exit-2 mask
            (conf <= threshold); float or 0-dim tensor.
        patience: early-stopping patience, in epochs without val improvement.

    Returns:
        ``current_threshold`` unchanged (also returned early on NaN loss).
    """
    print(f"\n[INIT] --- MODO DEBUG EXTREMO ATIVADO (CORRIGIDO) ---")
    print(f"[INIT] Device: {device} | LR: {lr} | Threshold: {current_threshold}")

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    # NOTE(review): factor=0.001 shrinks the LR by 1000x on plateau — far more
    # aggressive than the conventional 0.1. Confirm this is intentional.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.001, patience=7)

    model.to(device)
    criterion = nn.CrossEntropyLoss()

    # Tracked metrics: per-dataset (a/b/c) and averaged losses for both exits,
    # plus the joint loss. 'total_loss' is declared but never accumulated, so
    # its history stays at 0.0.
    metrics = [
        'loss1_a', 'loss1_b', 'loss1_c', 'loss_ex1_avg',
        'loss2_a', 'loss2_b', 'loss2_c', 'loss_ex2_avg',
        'l_joint', 'total_loss'
    ]

    history = {
        'train': {k: [] for k in metrics},
        'val': {k: [] for k in metrics}
    }

    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_model_state = None

    # Pad the shorter loaders with itertools.cycle so every step can draw one
    # batch from each dataset.
    # NOTE(review): cycle() caches its first pass, so for the shorter loaders
    # every later epoch replays the same first-epoch batch order — per-epoch
    # reshuffling is lost for them. Confirm this is acceptable.
    max_train_batches = max(len(l) for l in train_loaders)
    train_iter_loaders = [itertools.cycle(l) if len(l) < max_train_batches else l for l in train_loaders]

    max_val_batches = max(len(l) for l in val_loaders)
    val_iter_loaders = [itertools.cycle(l) if len(l) < max_val_batches else l for l in val_loaders]

    for epoch in range(epochs):
        print(f"\n{'#'*30} EPOCH {epoch+1}/{epochs} START {'#'*30}")
        model.train()

        running_metrics = {k: 0.0 for k in metrics}
        total_steps = 0

        loader_iterators = [iter(l) for l in train_iter_loaders]

        for batch_idx in range(max_train_batches):
            print(f"\n>>> [TRAIN] BATCH {batch_idx} START <<<")

            try:
                batches = [next(it) for it in loader_iterators]
            except StopIteration:
                print("[DEBUG] StopIteration atingido.")
                break

            optimizer.zero_grad()

            # One (inputs, labels) batch per dataset: A=UNSW, B=BoT, C=CIC.
            (inputs_a, labels_a) = batches[0]
            (inputs_b, labels_b) = batches[1]
            (inputs_c, labels_c) = batches[2]

            inputs_a, labels_a = inputs_a.to(device), labels_a.to(device)
            inputs_b, labels_b = inputs_b.to(device), labels_b.to(device)
            inputs_c, labels_c = inputs_c.to(device), labels_c.to(device)

            # --- INPUT DATA DEBUG ---
            print(f"[DATA A] Shape: {inputs_a.shape} | Mean: {inputs_a.mean():.3f} | Std: {inputs_a.std():.3f} | Min: {inputs_a.min():.3f} | Max: {inputs_a.max():.3f}")

            # --- FORWARD EXIT 1 ---
            out1_a = model.forward_exit1(inputs_a)
            out1_b = model.forward_exit1(inputs_b)
            out1_c = model.forward_exit1(inputs_c)

            # --- LOGITS DEBUG ---
            print(f"[LOGITS Ex1 A] Mean Abs: {out1_a.abs().mean():.3f} | Max: {out1_a.max():.3f}")

            # Per-sample confidence = max softmax probability of exit 1.
            probs_a = F.softmax(out1_a, dim=1)
            conf_a, _ = torch.max(probs_a, dim=1)

            probs_b = F.softmax(out1_b, dim=1)
            conf_b, _ = torch.max(probs_b, dim=1)

            probs_c = F.softmax(out1_c, dim=1)
            conf_c, _ = torch.max(probs_c, dim=1)

            # --- DETAILED PROBABILITY DEBUG ---
            # Determine the number of classes dynamically to avoid an error
            # when there are fewer than 3 classes.
            num_classes = probs_a.size(1)
            k_val = min(3, num_classes)

            top_k_prob, top_k_idx = torch.topk(probs_a[0], k_val)
            print(f"[SAMPLE 0 PREDICTION A] Top{k_val} Probs: {top_k_prob.detach().cpu().numpy()} | Indices: {top_k_idx.detach().cpu().numpy()} | Label Real: {labels_a[0].item()}")
            print(f"[CONFIDENCE A] Mean: {conf_a.mean().item():.3f} | Std: {conf_a.std().item():.3f}")

            # Confident samples would exit at branch 1; the rest continue to
            # exit 2.
            mask_a_ex1 = conf_a > current_threshold
            mask_b_ex1 = conf_b > current_threshold
            mask_c_ex1 = conf_c > current_threshold

            mask_a_ex2 = conf_a <= current_threshold
            mask_b_ex2 = conf_b <= current_threshold
            mask_c_ex2 = conf_c <= current_threshold

            print(f"[MASKS] A(Pass/Fail): {mask_a_ex1.sum()}/{mask_a_ex2.sum()} | B: {mask_b_ex1.sum()}/{mask_b_ex2.sum()} | C: {mask_c_ex1.sum()}/{mask_c_ex2.sum()}")

            # NOTE(review): below, the masks only gate WHETHER a loss term is
            # computed — when any sample passes, the loss is taken over the
            # FULL batch, not the masked subset. If per-exit losses should be
            # restricted to their samples, index out*/labels* with the mask.

            # --- LOSS EXIT 1 ---
            if mask_a_ex1.any():
                loss1_a = criterion(out1_a, labels_a)
            else:
                # Zero loss that stays attached to the graph so .backward()
                # and .item() remain valid.
                loss1_a = 0.0 * out1_a.sum()

            if mask_b_ex1.any():
                loss1_b = criterion(out1_b, labels_b)
            else:
                loss1_b = 0.0 * out1_b.sum()

            if mask_c_ex1.any():
                loss1_c = criterion(out1_c, labels_c)
            else:
                loss1_c = 0.0 * out1_c.sum()

            loss_ex1_avg = (loss1_a + loss1_b + loss1_c) / 3
            print(f"[LOSS Ex1] A: {loss1_a.item():.5f} | B: {loss1_b.item():.5f} | C: {loss1_c.item():.5f}")

            # --- FORWARD EXIT 2 ---
            out2_a = model.forward_exit2(inputs_a)
            out2_b = model.forward_exit2(inputs_b)
            out2_c = model.forward_exit2(inputs_c)

            # Debug logits of exit 2.
            print(f"[LOGITS Ex2 A] Mean Abs: {out2_a.abs().mean():.3f} | Max: {out2_a.max():.3f}")

            if mask_a_ex2.any():
                loss2_a = criterion(out2_a, labels_a)
            else:
                loss2_a = 0.0 * out2_a.sum()

            if mask_b_ex2.any():
                loss2_b = criterion(out2_b, labels_b)
            else:
                loss2_b = 0.0 * out2_b.sum()

            if mask_c_ex2.any():
                loss2_c = criterion(out2_c, labels_c)
            else:
                loss2_c = 0.0 * out2_c.sum()

            loss_ex2_avg = (loss2_a + loss2_b + loss2_c) / 3
            print(f"[LOSS Ex2] A: {loss2_a.item():.5f} | B: {loss2_b.item():.5f} | C: {loss2_c.item():.5f}")

            l_joint = loss_ex1_avg + loss_ex2_avg
            print(f"** [JOINT LOSS] ** : {l_joint.item():.6f}")

            # Abort immediately on NaN so the debug trail shows the last batch.
            if torch.isnan(l_joint):
                print("!!!!!!!!!! LOSS IS NAN !!!!!!!!!!")
                return current_threshold

            l_joint.backward()

            # --- PER-LAYER GRADIENT DEBUG ---
            print(f"[GRADIENTS CHECK]")
            has_grads = False
            for name, param in model.named_parameters():
                if param.grad is not None:
                    grad_mean = param.grad.abs().mean().item()
                    grad_max = param.grad.abs().max().item()
                    print(f"  -> Layer: {name} | Grad Mean: {grad_mean:.6f} | Grad Max: {grad_max:.6f}")
                    has_grads = True

            if not has_grads:
                print("!!! NENHUM GRADIENTE ENCONTRADO EM TODO O MODELO !!!")

            # Global L2 norm of all gradients (diagnostic only).
            total_norm = 0
            for p in model.parameters():
                if p.grad is not None:
                    param_norm = p.grad.data.norm(2)
                    total_norm += param_norm.item() ** 2
            total_norm = total_norm ** 0.5
            print(f"[GRADIENT NORM TOTAL] {total_norm:.4f}")

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            running_metrics['loss1_a'] += loss1_a.item()
            running_metrics['loss1_b'] += loss1_b.item()
            running_metrics['loss1_c'] += loss1_c.item()
            running_metrics['loss_ex1_avg'] += loss_ex1_avg.item()

            running_metrics['loss2_a'] += loss2_a.item()
            running_metrics['loss2_b'] += loss2_b.item()
            running_metrics['loss2_c'] += loss2_c.item()
            running_metrics['loss_ex2_avg'] += loss_ex2_avg.item()

            running_metrics['l_joint'] += l_joint.item()

            total_steps += 1

        for key in metrics:
            history['train'][key].append(running_metrics[key] / total_steps)

        epoch_train_loss = history['train']['l_joint'][-1]
        print(f"\n[EPOCH SUMMARY] Train Loss: {epoch_train_loss:.4f}")

        # --- VALIDATION ---
        # Mirrors the training loss computation, without gradients or the
        # per-batch debug prints.
        print(f"\n[VALIDATION] Starting...")
        model.eval()
        running_metrics_val = {k: 0.0 for k in metrics}
        total_steps_val = 0

        val_loader_iterators = [iter(l) for l in val_iter_loaders]

        with torch.no_grad():
            for batch_val_idx in range(max_val_batches):
                try:
                    batches = [next(it) for it in val_loader_iterators]
                except StopIteration:
                    break

                if batch_val_idx == 0:
                    print("[VAL] Processando primeiro batch de validação...")

                (inputs_a, labels_a) = batches[0]
                (inputs_b, labels_b) = batches[1]
                (inputs_c, labels_c) = batches[2]

                inputs_a, labels_a = inputs_a.to(device), labels_a.to(device)
                inputs_b, labels_b = inputs_b.to(device), labels_b.to(device)
                inputs_c, labels_c = inputs_c.to(device), labels_c.to(device)

                out1_a = model.forward_exit1(inputs_a)
                out1_b = model.forward_exit1(inputs_b)
                out1_c = model.forward_exit1(inputs_c)

                probs_a = F.softmax(out1_a, dim=1)
                conf_a, _ = torch.max(probs_a, dim=1)

                probs_b = F.softmax(out1_b, dim=1)
                conf_b, _ = torch.max(probs_b, dim=1)

                probs_c = F.softmax(out1_c, dim=1)
                conf_c, _ = torch.max(probs_c, dim=1)

                mask_a_ex1 = conf_a > current_threshold
                mask_b_ex1 = conf_b > current_threshold
                mask_c_ex1 = conf_c > current_threshold

                mask_a_ex2 = conf_a <= current_threshold
                mask_b_ex2 = conf_b <= current_threshold
                mask_c_ex2 = conf_c <= current_threshold

                if mask_a_ex1.any():
                    loss1_a = criterion(out1_a, labels_a)
                else:
                    loss1_a = 0.0 * out1_a.sum()

                if mask_b_ex1.any():
                    loss1_b = criterion(out1_b, labels_b)
                else:
                    loss1_b = 0.0 * out1_b.sum()

                if mask_c_ex1.any():
                    loss1_c = criterion(out1_c, labels_c)
                else:
                    loss1_c = 0.0 * out1_c.sum()

                loss_ex1_avg = (loss1_a + loss1_b + loss1_c) / 3

                out2_a = model.forward_exit2(inputs_a)
                out2_b = model.forward_exit2(inputs_b)
                out2_c = model.forward_exit2(inputs_c)

                if mask_a_ex2.any():
                    loss2_a = criterion(out2_a, labels_a)
                else:
                    loss2_a = 0.0 * out2_a.sum()

                if mask_b_ex2.any():
                    loss2_b = criterion(out2_b, labels_b)
                else:
                    loss2_b = 0.0 * out2_b.sum()

                if mask_c_ex2.any():
                    loss2_c = criterion(out2_c, labels_c)
                else:
                    loss2_c = 0.0 * out2_c.sum()

                loss_ex2_avg = (loss2_a + loss2_b + loss2_c) / 3

                l_joint = loss_ex1_avg + loss_ex2_avg

                running_metrics_val['loss1_a'] += loss1_a.item()
                running_metrics_val['loss1_b'] += loss1_b.item()
                running_metrics_val['loss1_c'] += loss1_c.item()
                running_metrics_val['loss_ex1_avg'] += loss_ex1_avg.item()

                running_metrics_val['loss2_a'] += loss2_a.item()
                running_metrics_val['loss2_b'] += loss2_b.item()
                running_metrics_val['loss2_c'] += loss2_c.item()
                running_metrics_val['loss_ex2_avg'] += loss_ex2_avg.item()

                running_metrics_val['l_joint'] += l_joint.item()

                total_steps_val += 1

        for key in metrics:
            history['val'][key].append(running_metrics_val[key] / total_steps_val)

        epoch_val_loss = history['val']['l_joint'][-1]

        # Threshold may be a 0-dim tensor; unwrap it only for printing.
        thresh_print = current_threshold.item() if isinstance(current_threshold, torch.Tensor) else current_threshold
        print(f'[EPOCH END] Val Loss: {epoch_val_loss:.4f} | Alpha: {thresh_print:.4f}')

        if epoch_val_loss < best_val_loss:
            print(f"!!! BEST MODEL SAVED !!! (Old: {best_val_loss:.4f} -> New: {epoch_val_loss:.4f})")
            best_val_loss = epoch_val_loss
            epochs_no_improve = 0
            # FIX: state_dict() returns live views of the parameters, so the
            # snapshot must be cloned — otherwise best_model_state silently
            # tracks the latest weights and early stopping restores nothing.
            best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        else:
            epochs_no_improve += 1
            print(f"No improve count: {epochs_no_improve}/{patience}")
            if epochs_no_improve >= patience:
                print("EARLY STOPPING TRIGGERED")
                if best_model_state: model.load_state_dict(best_model_state)
                break

        scheduler.step(epoch_val_loss)

    epochs_range = range(1, len(history['train']['l_joint']) + 1)

    # Plotting: one panel per exit plus the joint loss.
    fig, axs = plt.subplots(1, 3, figsize=(20, 6))
    ax = axs[0]
    ax.set_title("Exit 1")
    ax.plot(epochs_range, history['train']['loss1_a'], label='Tr A', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss1_b'], label='Tr B', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss1_c'], label='Tr C', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss_ex1_avg'], label='Tr Avg', linewidth=2)
    ax.plot(epochs_range, history['val']['loss1_a'], label='Val A', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss1_b'], label='Val B', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss1_c'], label='Val C', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss_ex1_avg'], label='Val Avg', color='black', linestyle='--', linewidth=2)
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True)

    ax = axs[1]
    ax.set_title("Exit 2")
    ax.plot(epochs_range, history['train']['loss2_a'], label='Tr A', color='blue', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss2_b'], label='Tr B', color='green', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss2_c'], label='Tr C', color='red', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss_ex2_avg'], label='Tr Avg', color='black', linewidth=2)
    ax.plot(epochs_range, history['val']['loss2_a'], label='Val A', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss2_b'], label='Val B', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss2_c'], label='Val C', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss_ex2_avg'], label='Val Avg', color='black', linestyle='--', linewidth=2)
    ax.set_xlabel('Epochs')
    ax.legend()
    ax.grid(True)

    ax = axs[2]
    ax.set_title("Global Optimization")
    ax.plot(epochs_range, history['train']['l_joint'], label='Tr Joint (Ex1 + Ex2)', color='purple')
    ax.plot(epochs_range, history['val']['l_joint'], label='Val Joint', color='purple', linestyle='--')
    ax.set_xlabel('Epochs')
    ax.legend()
    ax.grid(True)

    plt.tight_layout()
    plt.show()

    return current_threshold
In [43]:
def evaluate_model(model, loader, confidence_threshold, device):
    """Evaluate an early-exit model on `loader`, reporting accuracy and timing.

    Each batch is first routed through the early-exit branch
    (`model.forward_exit1`); samples whose softmax confidence exceeds
    `confidence_threshold` keep that prediction, and only the remaining
    low-confidence samples are forwarded to the main branch
    (`model.forward_exit2`).

    Args:
        model: network exposing `forward_exit1` and `forward_exit2`.
        loader: DataLoader yielding `(samples, labels)` batches.
        confidence_threshold: softmax-confidence cutoff for accepting the
            early exit (strictly greater-than comparison).
        device: torch device (or device string) to run inference on.

    Returns:
        dict with keys 'accuracy', 'exit_rate', 'avg_inference_time_ms',
        'exited_early_count', 'total_samples' and 'f1'.

    Side effects: displays a confusion-matrix heatmap and prints
    TP/TN/FP/FN, F1, TPR and TNR. Binary labels are assumed
    (0 = Normal, 1 = Ataque) — `cm.ravel()` into 4 values requires a
    2x2 matrix.
    """
    model.to(device)
    model.eval()

    def _sync():
        # torch.cuda.synchronize() raises when CUDA is unavailable and is
        # meaningless when timing CPU inference, so only synchronize when
        # actually running on a CUDA device. (The original called it
        # unconditionally, crashing CPU-only evaluation.)
        if torch.cuda.is_available() and 'cuda' in str(device):
            torch.cuda.synchronize()

    total_samples = len(loader.dataset)
    all_predictions = []
    all_labels = []
    exited_early_count = 0
    total_inference_time = 0.0

    with torch.no_grad():
        for samples, labels in loader:
            samples, labels = samples.to(device), labels.to(device)

            _sync()
            start_time = time.perf_counter()

            # --- Early-exit branch ---
            branch_output = model.forward_exit1(samples)
            branch_prob = F.softmax(branch_output, dim=1)
            trusts, branch_preds = torch.max(branch_prob, 1)

            batch_predictions = torch.zeros_like(labels)

            early_exit_mask = trusts > confidence_threshold
            if early_exit_mask.any():
                batch_predictions[early_exit_mask] = branch_preds[early_exit_mask]
                exited_early_count += early_exit_mask.sum().item()

            # --- Main branch, only for samples that failed the threshold ---
            main_branch_mask = ~early_exit_mask
            if main_branch_mask.any():
                samples_to_main = samples[main_branch_mask]
                main_output = model.forward_exit2(samples_to_main)
                main_prob = F.softmax(main_output, dim=1)
                _, main_preds = torch.max(main_prob, 1)
                batch_predictions[main_branch_mask] = main_preds

            _sync()
            end_time = time.perf_counter()
            total_inference_time += (end_time - start_time)

            all_predictions.append(batch_predictions.cpu())
            all_labels.append(labels.cpu())

    final_predictions = torch.cat(all_predictions)
    y_data = torch.cat(all_labels)

    correct = (final_predictions == y_data).sum().item()
    accuracy = 100 * correct / total_samples
    exit_rate = 100 * exited_early_count / total_samples
    avg_time_ms = (total_inference_time / total_samples) * 1000

    cm = confusion_matrix(y_data.numpy(), final_predictions.numpy())

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Ataque'],
                yticklabels=['Normal', 'Ataque'])
    plt.xlabel('Rótulo Previsto')
    plt.ylabel('Rótulo Verdadeiro')
    plt.title(f'Matriz de Confusão (Limiar de Confiança = {confidence_threshold})')
    plt.show()

    # ravel() of a 2x2 confusion matrix yields (tn, fp, fn, tp).
    tn, fp, fn, tp = cm.ravel()

    f1 = f1_score(y_data.numpy(), final_predictions.numpy())

    tpr = recall_score(y_data.numpy(), final_predictions.numpy())

    # sklearn has no direct specificity scorer; compute TNR manually,
    # guarding against division by zero when there are no negatives.
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0

    print(f"True Positives (TP): {tp}")
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}\n")

    print(f"F1 Score: {f1:.4f}")
    print(f"True Positive Rate (TPR) / Recall: {tpr:.4f}")
    print(f"True Negative Rate (TNR) / Specificity: {tnr:.4f}")

    return {
            'accuracy': accuracy,
            'exit_rate': exit_rate,
            'avg_inference_time_ms': avg_time_ms,
            'exited_early_count': exited_early_count,
            'total_samples': total_samples,
            'f1': f1
        }
In [44]:
# Experiment identifier — used to name the saved checkpoint (models/<modelname>.pth).
modelname = 'teste_ljoint9'
# Bare expression so the notebook displays the chosen name.
modelname
Out[44]:
'teste_ljoint9'
In [45]:
# Maximum number of training epochs (early stopping inside train_model may end sooner).
epochs = 500

# Train the early-exit model; returns the confidence threshold used/tuned during training.
limiar = train_model(
    model, 
    train_loaders, 
    val_loaders, 
    epochs,
    current_threshold=0.55,
    lr=0.0001,
    device=device
)

# torch.save does not create intermediate directories; ensure 'models/' exists
# so the checkpoint write cannot fail with FileNotFoundError.
import os
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), f'models/{modelname}.pth')
print(f"\nModelo treinado e salvo em 'models/{modelname}.pth'")
[INIT] --- MODO DEBUG EXTREMO ATIVADO (CORRIGIDO) ---
[INIT] Device: cuda | LR: 0.0001 | Threshold: 0.55

############################## EPOCH 1/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.146
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5263452  0.47365478] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.004
[MASKS] A(Pass/Fail): 4/2044 | B: 107/1941 | C: 260/1788
[LOSS Ex1] A: 0.69271 | B: 0.69443 | C: 0.68963
[LOGITS Ex2 A] Mean Abs: 0.037 | Max: 0.047
[LOSS Ex2] A: 0.69331 | B: 0.69367 | C: 0.69326
** [JOINT LOSS] ** : 1.385670
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000274 | Grad Max: 0.003790
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001401 | Grad Max: 0.006197
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.011667
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.031979 | Grad Max: 0.031979
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000001 | Grad Max: 0.000033
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000015 | Grad Max: 0.000141
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000054
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000022 | Grad Max: 0.000202
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000064
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000047 | Grad Max: 0.000366
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000080
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000151 | Grad Max: 0.001195
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000096 | Grad Max: 0.000669
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013826 | Grad Max: 0.013826
[GRADIENT NORM TOTAL] 0.0712

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.148
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51929814 0.48070186] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.004
[MASKS] A(Pass/Fail): 0/2048 | B: 110/1938 | C: 242/1806
[LOSS Ex1] A: 0.00000 | B: 0.69431 | C: 0.68802
[LOGITS Ex2 A] Mean Abs: 0.026 | Max: 0.036
[LOSS Ex2] A: 0.69189 | B: 0.69314 | C: 0.69393
** [JOINT LOSS] ** : 1.153760
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000168 | Grad Max: 0.002961
  -> Layer: shared_layers.0.bias | Grad Mean: 0.000797 | Grad Max: 0.003885
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.000948 | Grad Max: 0.004361
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013003 | Grad Max: 0.013003
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000002 | Grad Max: 0.000042
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000017 | Grad Max: 0.000164
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000061
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000023 | Grad Max: 0.000220
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000061
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000048 | Grad Max: 0.000389
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000093
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000151 | Grad Max: 0.001173
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000101 | Grad Max: 0.000873
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014207 | Grad Max: 0.014207
[GRADIENT NORM TOTAL] 0.0406

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.175
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52613604 0.473864  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 88/1960 | C: 274/1774
[LOSS Ex1] A: 0.69191 | B: 0.69315 | C: 0.68704
[LOGITS Ex2 A] Mean Abs: 0.015 | Max: 0.028
[LOSS Ex2] A: 0.69154 | B: 0.69260 | C: 0.69273
** [JOINT LOSS] ** : 1.382988
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000293 | Grad Max: 0.003905
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001196 | Grad Max: 0.004533
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.011349
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021428 | Grad Max: 0.021428
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000002 | Grad Max: 0.000051
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000019 | Grad Max: 0.000136
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000100
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000024 | Grad Max: 0.000342
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000077
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000045 | Grad Max: 0.000403
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000103
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000127 | Grad Max: 0.001046
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000100 | Grad Max: 0.000727
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011559 | Grad Max: 0.011559
[GRADIENT NORM TOTAL] 0.0605

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.166
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5268466  0.47315347] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 99/1757 | C: 238/1810
[LOSS Ex1] A: 0.69059 | B: 0.69446 | C: 0.69029
[LOGITS Ex2 A] Mean Abs: 0.010 | Max: 0.021
[LOSS Ex2] A: 0.69096 | B: 0.69233 | C: 0.69240
** [JOINT LOSS] ** : 1.383674
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000250 | Grad Max: 0.002665
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001150 | Grad Max: 0.005426
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001719 | Grad Max: 0.009803
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024030 | Grad Max: 0.024030
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000002 | Grad Max: 0.000066
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000018 | Grad Max: 0.000184
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000061
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000021 | Grad Max: 0.000189
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000050
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000035 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000001 | Grad Max: 0.000077
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000067 | Grad Max: 0.000948
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000067 | Grad Max: 0.000486
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002856 | Grad Max: 0.002856
[GRADIENT NORM TOTAL] 0.0547

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.141
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5166012  0.48339874] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 107/1941 | C: 236/1812
[LOSS Ex1] A: 0.69022 | B: 0.69415 | C: 0.69074
[LOGITS Ex2 A] Mean Abs: 0.012 | Max: 0.024
[LOSS Ex2] A: 0.68936 | B: 0.69219 | C: 0.69213
** [JOINT LOSS] ** : 1.382931
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000262 | Grad Max: 0.003296
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001157 | Grad Max: 0.005331
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.010335
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.023048 | Grad Max: 0.023048
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000003 | Grad Max: 0.000088
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000025 | Grad Max: 0.000268
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000066
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000027 | Grad Max: 0.000210
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000062
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000043 | Grad Max: 0.000396
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000078 | Grad Max: 0.001290
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000075 | Grad Max: 0.000533
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001089 | Grad Max: 0.001089
[GRADIENT NORM TOTAL] 0.0555

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.140
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.515395   0.48460504] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 110/1938 | C: 241/1807
[LOSS Ex1] A: 0.69316 | B: 0.69404 | C: 0.69074
[LOGITS Ex2 A] Mean Abs: 0.016 | Max: 0.031
[LOSS Ex2] A: 0.68849 | B: 0.69175 | C: 0.69200
** [JOINT LOSS] ** : 1.383394
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000283 | Grad Max: 0.003826
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001676 | Grad Max: 0.007289
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.011786
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.036556 | Grad Max: 0.036556
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000004 | Grad Max: 0.000133
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000049 | Grad Max: 0.000529
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000002 | Grad Max: 0.000105
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000054 | Grad Max: 0.000388
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000120
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000092 | Grad Max: 0.000808
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000136
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000227 | Grad Max: 0.001978
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000132 | Grad Max: 0.001007
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017831 | Grad Max: 0.017831
[GRADIENT NORM TOTAL] 0.0817

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.171
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5218072  0.47819284] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 88/1960 | C: 240/1808
[LOSS Ex1] A: 0.69208 | B: 0.69287 | C: 0.69087
[LOGITS Ex2 A] Mean Abs: 0.018 | Max: 0.045
[LOSS Ex2] A: 0.68734 | B: 0.69096 | C: 0.69108
** [JOINT LOSS] ** : 1.381736
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000286 | Grad Max: 0.003647
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001519 | Grad Max: 0.006798
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.011025
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.031033 | Grad Max: 0.031033
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000004 | Grad Max: 0.000154
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000051 | Grad Max: 0.000728
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000002 | Grad Max: 0.000102
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000049 | Grad Max: 0.000371
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000115
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000077 | Grad Max: 0.000714
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000135
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000174 | Grad Max: 0.001524
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000119 | Grad Max: 0.000896
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011617 | Grad Max: 0.011617
[GRADIENT NORM TOTAL] 0.0710

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.097
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5116345  0.48836553] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 98/1758 | C: 264/1784
[LOSS Ex1] A: 0.00000 | B: 0.69418 | C: 0.69090
[LOGITS Ex2 A] Mean Abs: 0.022 | Max: 0.065
[LOSS Ex2] A: 0.68536 | B: 0.69033 | C: 0.69078
** [JOINT LOSS] ** : 1.150513
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000192 | Grad Max: 0.002905
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001436 | Grad Max: 0.006228
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001276 | Grad Max: 0.004786
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024602 | Grad Max: 0.024602
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000005 | Grad Max: 0.000162
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000049 | Grad Max: 0.000580
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000002 | Grad Max: 0.000108
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000043 | Grad Max: 0.000413
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000086
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000063 | Grad Max: 0.000583
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000120
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000133 | Grad Max: 0.001281
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000142 | Grad Max: 0.000974
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007047 | Grad Max: 0.007047
[GRADIENT NORM TOTAL] 0.0541

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.107
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51330423 0.4866957 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.005
[MASKS] A(Pass/Fail): 0/1616 | B: 107/1941 | C: 253/1795
[LOSS Ex1] A: 0.00000 | B: 0.69387 | C: 0.68930
[LOGITS Ex2 A] Mean Abs: 0.030 | Max: 0.089
[LOSS Ex2] A: 0.68325 | B: 0.68952 | C: 0.68942
** [JOINT LOSS] ** : 1.148453
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000196 | Grad Max: 0.003207
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001058 | Grad Max: 0.004890
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001090 | Grad Max: 0.004547
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018956 | Grad Max: 0.018956
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000007 | Grad Max: 0.000166
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000060 | Grad Max: 0.000562
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000003 | Grad Max: 0.000149
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000048 | Grad Max: 0.000532
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000115
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000065 | Grad Max: 0.000572
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000117
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000114 | Grad Max: 0.001075
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000219 | Grad Max: 0.001389
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006146 | Grad Max: 0.006146
[GRADIENT NORM TOTAL] 0.0487

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.150
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52465856 0.47534147] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.005
[MASKS] A(Pass/Fail): 4/2044 | B: 108/1940 | C: 241/1807
[LOSS Ex1] A: 0.69178 | B: 0.69377 | C: 0.68912
[LOGITS Ex2 A] Mean Abs: 0.039 | Max: 0.120
[LOSS Ex2] A: 0.68193 | B: 0.68846 | C: 0.68912
** [JOINT LOSS] ** : 1.378061
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000291 | Grad Max: 0.004099
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001262 | Grad Max: 0.005157
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001883 | Grad Max: 0.010712
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024290 | Grad Max: 0.024290
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000007 | Grad Max: 0.000185
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000074 | Grad Max: 0.000632
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000003 | Grad Max: 0.000184
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000059 | Grad Max: 0.000491
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000102
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000075 | Grad Max: 0.000725
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000144
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000127 | Grad Max: 0.001376
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000244 | Grad Max: 0.001452
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006685 | Grad Max: 0.006685
[GRADIENT NORM TOTAL] 0.0648

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.152
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5175542  0.48244575] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 88/1960 | C: 238/1810
[LOSS Ex1] A: 0.00000 | B: 0.69258 | C: 0.68859
[LOGITS Ex2 A] Mean Abs: 0.046 | Max: 0.141
[LOSS Ex2] A: 0.67841 | B: 0.68664 | C: 0.68757
** [JOINT LOSS] ** : 1.144593
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000217 | Grad Max: 0.003782
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001008 | Grad Max: 0.004703
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001011 | Grad Max: 0.004472
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015994 | Grad Max: 0.015994
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000009 | Grad Max: 0.000239
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000079 | Grad Max: 0.000897
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000003 | Grad Max: 0.000161
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000060 | Grad Max: 0.000453
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000112
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000076 | Grad Max: 0.000701
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000166
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000129 | Grad Max: 0.001627
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001743
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004185 | Grad Max: 0.004185
[GRADIENT NORM TOTAL] 0.0500

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.176
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5242592  0.47574073] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 97/1759 | C: 243/1805
[LOSS Ex1] A: 0.69096 | B: 0.69390 | C: 0.69062
[LOGITS Ex2 A] Mean Abs: 0.053 | Max: 0.173
[LOSS Ex2] A: 0.67476 | B: 0.68592 | C: 0.68723
** [JOINT LOSS] ** : 1.374464
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000313 | Grad Max: 0.004346
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001491 | Grad Max: 0.006072
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.011524
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.029302 | Grad Max: 0.029302
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000010 | Grad Max: 0.000249
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000090 | Grad Max: 0.000977
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000004 | Grad Max: 0.000200
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000063 | Grad Max: 0.000531
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000121
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000077 | Grad Max: 0.000625
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000133
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000139 | Grad Max: 0.001316
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001697
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001239 | Grad Max: 0.001239
[GRADIENT NORM TOTAL] 0.0743

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.165
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52507335 0.47492662] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 106/1942 | C: 245/1803
[LOSS Ex1] A: 0.68993 | B: 0.69359 | C: 0.68994
[LOGITS Ex2 A] Mean Abs: 0.059 | Max: 0.204
[LOSS Ex2] A: 0.67715 | B: 0.68473 | C: 0.68562
** [JOINT LOSS] ** : 1.373654
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000301 | Grad Max: 0.004100
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001447 | Grad Max: 0.006355
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001760 | Grad Max: 0.010040
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024106 | Grad Max: 0.024106
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000011 | Grad Max: 0.000269
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000101 | Grad Max: 0.001189
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000004 | Grad Max: 0.000170
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000067 | Grad Max: 0.000586
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000093
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000072 | Grad Max: 0.000492
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000131
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000137 | Grad Max: 0.001063
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000341 | Grad Max: 0.001677
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003772 | Grad Max: 0.003772
[GRADIENT NORM TOTAL] 0.0668

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.140
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5145294  0.48547053] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 107/1941 | C: 180/1196
[LOSS Ex1] A: 0.68956 | B: 0.69351 | C: 0.68727
[LOGITS Ex2 A] Mean Abs: 0.068 | Max: 0.223
[LOSS Ex2] A: 0.67092 | B: 0.68268 | C: 0.67964
** [JOINT LOSS] ** : 1.367858
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000329 | Grad Max: 0.004891
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001331 | Grad Max: 0.008628
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001793 | Grad Max: 0.010370
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013410 | Grad Max: 0.013410
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000014 | Grad Max: 0.000331
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000138 | Grad Max: 0.001324
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000005 | Grad Max: 0.000254
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000094 | Grad Max: 0.000703
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000130
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000106 | Grad Max: 0.000687
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000190 | Grad Max: 0.001495
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000509 | Grad Max: 0.002465
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009612 | Grad Max: 0.009612
[GRADIENT NORM TOTAL] 0.0718

[EPOCH SUMMARY] Train Loss: 1.3137

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.3346 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: inf -> New: 1.3346)

############################## EPOCH 2/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.143
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5113414 0.4886586] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 86/1962 | C: 237/1811
[LOSS Ex1] A: 0.69218 | B: 0.69230 | C: 0.69106
[LOGITS Ex2 A] Mean Abs: 0.084 | Max: 0.262
[LOSS Ex2] A: 0.66491 | B: 0.68147 | C: 0.68477
** [JOINT LOSS] ** : 1.368896
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000359 | Grad Max: 0.004874
  -> Layer: shared_layers.0.bias | Grad Mean: 0.002497 | Grad Max: 0.011833
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002229 | Grad Max: 0.011853
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.036587 | Grad Max: 0.036587
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000018 | Grad Max: 0.000599
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000285 | Grad Max: 0.002927
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000299
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000215 | Grad Max: 0.001339
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000181
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000209 | Grad Max: 0.001226
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000319 | Grad Max: 0.002921
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000410 | Grad Max: 0.002644
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020730 | Grad Max: 0.020730
[GRADIENT NORM TOTAL] 0.1048

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.172
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5200744  0.47992554] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 95/1761 | C: 260/1788
[LOSS Ex1] A: 0.69133 | B: 0.69363 | C: 0.68700
[LOGITS Ex2 A] Mean Abs: 0.095 | Max: 0.309
[LOSS Ex2] A: 0.66574 | B: 0.67965 | C: 0.67877
** [JOINT LOSS] ** : 1.365372
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000357 | Grad Max: 0.005013
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001934 | Grad Max: 0.008305
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001777 | Grad Max: 0.010390
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022877 | Grad Max: 0.022877
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000017 | Grad Max: 0.000538
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000197 | Grad Max: 0.002222
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000006 | Grad Max: 0.000249
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000122 | Grad Max: 0.001103
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000140
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000116 | Grad Max: 0.000784
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000181
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000191 | Grad Max: 0.002018
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.002031
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006191 | Grad Max: 0.006191
[GRADIENT NORM TOTAL] 0.0786

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.095
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078551  0.49214488] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 106/1942 | C: 219/1829
[LOSS Ex1] A: 0.00000 | B: 0.69333 | C: 0.69038
[LOGITS Ex2 A] Mean Abs: 0.108 | Max: 0.354
[LOSS Ex2] A: 0.66084 | B: 0.67535 | C: 0.67993
** [JOINT LOSS] ** : 1.133273
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000326 | Grad Max: 0.005157
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001947 | Grad Max: 0.009096
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001168 | Grad Max: 0.004467
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022083 | Grad Max: 0.022083
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000019 | Grad Max: 0.000615
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000174 | Grad Max: 0.002597
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000006 | Grad Max: 0.000207
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000093 | Grad Max: 0.000699
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000103
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000094 | Grad Max: 0.000647
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000192
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.001619
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000550 | Grad Max: 0.002265
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004386 | Grad Max: 0.004386
[GRADIENT NORM TOTAL] 0.0739

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.106
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51048183 0.48951814] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 106/1942 | C: 240/1808
[LOSS Ex1] A: 0.00000 | B: 0.69326 | C: 0.68914
[LOGITS Ex2 A] Mean Abs: 0.129 | Max: 0.422
[LOSS Ex2] A: 0.65776 | B: 0.67115 | C: 0.67434
** [JOINT LOSS] ** : 1.128550
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000388 | Grad Max: 0.005712
  -> Layer: shared_layers.0.bias | Grad Mean: 0.001840 | Grad Max: 0.013325
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001052 | Grad Max: 0.004318
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018871 | Grad Max: 0.018871
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000023 | Grad Max: 0.000610
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000260 | Grad Max: 0.002804
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000307
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000155 | Grad Max: 0.001229
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000004 | Grad Max: 0.000132
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000134 | Grad Max: 0.000813
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000201
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000186 | Grad Max: 0.001409
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000680 | Grad Max: 0.002762
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009435 | Grad Max: 0.009435
[GRADIENT NORM TOTAL] 0.0854

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.154
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5230074  0.47699255] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 86/1962 | C: 239/1809
[LOSS Ex1] A: 0.69094 | B: 0.69204 | C: 0.68893
[LOGITS Ex2 A] Mean Abs: 0.149 | Max: 0.474
[LOSS Ex2] A: 0.65880 | B: 0.66525 | C: 0.67321
** [JOINT LOSS] ** : 1.356386
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000487 | Grad Max: 0.008658
  -> Layer: shared_layers.0.bias | Grad Mean: 0.002279 | Grad Max: 0.018578
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001898 | Grad Max: 0.010941
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024539 | Grad Max: 0.024539
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000026 | Grad Max: 0.000699
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000266 | Grad Max: 0.003601
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000293
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000144 | Grad Max: 0.001029
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000004 | Grad Max: 0.000134
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000116 | Grad Max: 0.000730
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000165 | Grad Max: 0.001357
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000665 | Grad Max: 0.002914
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008034 | Grad Max: 0.008034
[GRADIENT NORM TOTAL] 0.0979

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.155
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51590544 0.48409456] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 95/1761 | C: 241/1807
[LOSS Ex1] A: 0.00000 | B: 0.69338 | C: 0.68797
[LOGITS Ex2 A] Mean Abs: 0.162 | Max: 0.555
[LOSS Ex2] A: 0.64760 | B: 0.66422 | C: 0.66446
** [JOINT LOSS] ** : 1.119212
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000468 | Grad Max: 0.007503
  -> Layer: shared_layers.0.bias | Grad Mean: 0.002450 | Grad Max: 0.014209
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001095 | Grad Max: 0.004676
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019605 | Grad Max: 0.019605
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000028 | Grad Max: 0.000889
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000253 | Grad Max: 0.003051
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000294
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000122 | Grad Max: 0.000881
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000004 | Grad Max: 0.000120
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000100 | Grad Max: 0.000659
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000159
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000168 | Grad Max: 0.001036
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000786 | Grad Max: 0.002942
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001387 | Grad Max: 0.001387
[GRADIENT NORM TOTAL] 0.0925

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.177
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5224882  0.47751182] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 105/1943 | C: 251/1797
[LOSS Ex1] A: 0.69011 | B: 0.69308 | C: 0.68847
[LOGITS Ex2 A] Mean Abs: 0.179 | Max: 0.602
[LOSS Ex2] A: 0.64161 | B: 0.66106 | C: 0.66357
** [JOINT LOSS] ** : 1.345968
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000564 | Grad Max: 0.010050
  -> Layer: shared_layers.0.bias | Grad Mean: 0.002798 | Grad Max: 0.023381
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.011440
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.027074 | Grad Max: 0.027074
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000031 | Grad Max: 0.001052
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000268 | Grad Max: 0.003639
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000250
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000123 | Grad Max: 0.000845
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000004 | Grad Max: 0.000139
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000094 | Grad Max: 0.000635
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000148
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000155 | Grad Max: 0.001062
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000801 | Grad Max: 0.002699
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000806 | Grad Max: 0.000806
[GRADIENT NORM TOTAL] 0.1087

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.165
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5234131  0.47658685] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 103/1945 | C: 232/1816
[LOSS Ex1] A: 0.68935 | B: 0.69303 | C: 0.68768
[LOGITS Ex2 A] Mean Abs: 0.195 | Max: 0.703
[LOSS Ex2] A: 0.65306 | B: 0.65702 | C: 0.65335
** [JOINT LOSS] ** : 1.344498
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000653 | Grad Max: 0.011215
  -> Layer: shared_layers.0.bias | Grad Mean: 0.002823 | Grad Max: 0.027862
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001713 | Grad Max: 0.010119
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017481 | Grad Max: 0.017481
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000039 | Grad Max: 0.001102
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000337 | Grad Max: 0.004805
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000010 | Grad Max: 0.000416
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000165 | Grad Max: 0.001052
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000135
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000108 | Grad Max: 0.000777
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000165
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000962
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000747 | Grad Max: 0.003018
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008489 | Grad Max: 0.008489
[GRADIENT NORM TOTAL] 0.1152

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.139
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5125335 0.4874665] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 82/1966 | C: 245/1803
[LOSS Ex1] A: 0.00000 | B: 0.69179 | C: 0.68820
[LOGITS Ex2 A] Mean Abs: 0.214 | Max: 0.848
[LOSS Ex2] A: 0.64094 | B: 0.64954 | C: 0.65725
** [JOINT LOSS] ** : 1.109238
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000714 | Grad Max: 0.013491
  -> Layer: shared_layers.0.bias | Grad Mean: 0.003627 | Grad Max: 0.027762
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001089 | Grad Max: 0.004856
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018156 | Grad Max: 0.018156
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000045 | Grad Max: 0.001489
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000366 | Grad Max: 0.006041
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000011 | Grad Max: 0.000327
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000151 | Grad Max: 0.000936
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000121
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000096 | Grad Max: 0.000677
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000124 | Grad Max: 0.000873
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000747 | Grad Max: 0.002652
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000234 | Grad Max: 0.000234
[GRADIENT NORM TOTAL] 0.1209

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.146
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50782883 0.49217114] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 92/1764 | C: 218/1830
[LOSS Ex1] A: 0.69133 | B: 0.69314 | C: 0.69041
[LOGITS Ex2 A] Mean Abs: 0.249 | Max: 0.930
[LOSS Ex2] A: 0.63653 | B: 0.64502 | C: 0.65675
** [JOINT LOSS] ** : 1.337726
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000754 | Grad Max: 0.013033
  -> Layer: shared_layers.0.bias | Grad Mean: 0.006699 | Grad Max: 0.038463
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.011912
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.035176 | Grad Max: 0.035176
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000065 | Grad Max: 0.002250
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000875 | Grad Max: 0.007947
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000021 | Grad Max: 0.000736
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000520 | Grad Max: 0.002573
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000263
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000320 | Grad Max: 0.001422
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000265
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000309 | Grad Max: 0.001756
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000641 | Grad Max: 0.003498
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015760 | Grad Max: 0.015760
[GRADIENT NORM TOTAL] 0.1811

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.172
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51857543 0.48142457] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 104/1944 | C: 220/1828
[LOSS Ex1] A: 0.69071 | B: 0.69284 | C: 0.68744
[LOGITS Ex2 A] Mean Abs: 0.260 | Max: 0.955
[LOSS Ex2] A: 0.64911 | B: 0.63519 | C: 0.64405
** [JOINT LOSS] ** : 1.333116
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000956 | Grad Max: 0.023314
  -> Layer: shared_layers.0.bias | Grad Mean: 0.006703 | Grad Max: 0.046837
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001776 | Grad Max: 0.010218
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022812 | Grad Max: 0.022812
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000070 | Grad Max: 0.002128
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000985 | Grad Max: 0.006912
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.000883
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000614 | Grad Max: 0.003363
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000271
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000370 | Grad Max: 0.001557
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000345 | Grad Max: 0.001766
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001012 | Grad Max: 0.005338
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021675 | Grad Max: 0.021675
[GRADIENT NORM TOTAL] 0.2067

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.093
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50461024 0.4953898 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 102/1946 | C: 213/1835
[LOSS Ex1] A: 0.00000 | B: 0.69281 | C: 0.68740
[LOGITS Ex2 A] Mean Abs: 0.271 | Max: 1.035
[LOSS Ex2] A: 0.63825 | B: 0.62868 | C: 0.64292
** [JOINT LOSS] ** : 1.096682
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000990 | Grad Max: 0.026298
  -> Layer: shared_layers.0.bias | Grad Mean: 0.007545 | Grad Max: 0.046133
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.000968 | Grad Max: 0.004414
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014022 | Grad Max: 0.014022
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000074 | Grad Max: 0.002278
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001077 | Grad Max: 0.008500
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000024 | Grad Max: 0.000744
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000642 | Grad Max: 0.002937
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000268
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000367 | Grad Max: 0.001512
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000307
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000331 | Grad Max: 0.001571
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001000 | Grad Max: 0.004966
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019862 | Grad Max: 0.019862
[GRADIENT NORM TOTAL] 0.2134

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.105
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082936  0.49170634] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 82/1966 | C: 231/1817
[LOSS Ex1] A: 0.00000 | B: 0.69156 | C: 0.68627
[LOGITS Ex2 A] Mean Abs: 0.285 | Max: 1.057
[LOSS Ex2] A: 0.61686 | B: 0.63328 | C: 0.63068
** [JOINT LOSS] ** : 1.086216
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000937 | Grad Max: 0.019605
  -> Layer: shared_layers.0.bias | Grad Mean: 0.008149 | Grad Max: 0.049096
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001001 | Grad Max: 0.004859
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011017 | Grad Max: 0.011017
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000080 | Grad Max: 0.002448
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001061 | Grad Max: 0.010500
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000024 | Grad Max: 0.000803
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000580 | Grad Max: 0.002843
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000267
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000323 | Grad Max: 0.001212
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000247
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000281 | Grad Max: 0.001364
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000852 | Grad Max: 0.003998
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012960 | Grad Max: 0.012960
[GRADIENT NORM TOTAL] 0.2015

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.158
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5215471 0.4784529] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.006
[MASKS] A(Pass/Fail): 2/2046 | B: 91/1765 | C: 176/1200
[LOSS Ex1] A: 0.69033 | B: 0.69291 | C: 0.68661
[LOGITS Ex2 A] Mean Abs: 0.294 | Max: 1.129
[LOSS Ex2] A: 0.61353 | B: 0.63024 | C: 0.63309
** [JOINT LOSS] ** : 1.315575
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.000990 | Grad Max: 0.017834
  -> Layer: shared_layers.0.bias | Grad Mean: 0.013137 | Grad Max: 0.071255
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.011159
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022340 | Grad Max: 0.022340
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000123 | Grad Max: 0.003386
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002015 | Grad Max: 0.014653
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.001133
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001244 | Grad Max: 0.005344
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000428
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000682 | Grad Max: 0.002480
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000456
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000581 | Grad Max: 0.002241
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001536 | Grad Max: 0.005940
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030050 | Grad Max: 0.030050
[GRADIENT NORM TOTAL] 0.3268

[EPOCH SUMMARY] Train Loss: 1.2458

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.2776 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.3346 -> New: 1.2776)

############################## EPOCH 3/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.158
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5145997  0.48540035] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 104/1944 | C: 218/1830
[LOSS Ex1] A: 0.00000 | B: 0.69263 | C: 0.68681
[LOGITS Ex2 A] Mean Abs: 0.303 | Max: 1.372
[LOSS Ex2] A: 0.61768 | B: 0.61410 | C: 0.63222
** [JOINT LOSS] ** : 1.081145
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001117 | Grad Max: 0.027060
  -> Layer: shared_layers.0.bias | Grad Mean: 0.008944 | Grad Max: 0.043180
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.000986 | Grad Max: 0.004560
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014544 | Grad Max: 0.014544
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000086 | Grad Max: 0.002419
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001179 | Grad Max: 0.007743
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000024 | Grad Max: 0.000815
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000622 | Grad Max: 0.003297
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000317 | Grad Max: 0.001322
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000256 | Grad Max: 0.001203
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001030 | Grad Max: 0.005488
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013948 | Grad Max: 0.013948
[GRADIENT NORM TOTAL] 0.2223

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5210887  0.47891128] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 102/1946 | C: 225/1823
[LOSS Ex1] A: 0.68955 | B: 0.69260 | C: 0.68444
[LOGITS Ex2 A] Mean Abs: 0.315 | Max: 1.297
[LOSS Ex2] A: 0.61685 | B: 0.60470 | C: 0.62445
** [JOINT LOSS] ** : 1.304197
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001258 | Grad Max: 0.029777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.020291 | Grad Max: 0.091638
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.010589
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014122 | Grad Max: 0.014122
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000176 | Grad Max: 0.003858
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003068 | Grad Max: 0.018550
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000064 | Grad Max: 0.002007
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001877 | Grad Max: 0.008301
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000645
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000975 | Grad Max: 0.003402
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000558
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000779 | Grad Max: 0.002754
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002554 | Grad Max: 0.010251
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041153 | Grad Max: 0.041153
[GRADIENT NORM TOTAL] 0.4838

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.165
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52205557 0.47794446] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 81/1967 | C: 240/1808
[LOSS Ex1] A: 0.68900 | B: 0.69134 | C: 0.68692
[LOGITS Ex2 A] Mean Abs: 0.293 | Max: 1.321
[LOSS Ex2] A: 0.61256 | B: 0.60469 | C: 0.61878
** [JOINT LOSS] ** : 1.301098
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001210 | Grad Max: 0.022391
  -> Layer: shared_layers.0.bias | Grad Mean: 0.007742 | Grad Max: 0.056442
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001625 | Grad Max: 0.009413
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013874 | Grad Max: 0.013874
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000090 | Grad Max: 0.002347
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000993 | Grad Max: 0.011694
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.000746
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000519 | Grad Max: 0.002536
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000296
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000262 | Grad Max: 0.001324
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000282
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000208 | Grad Max: 0.001215
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001057 | Grad Max: 0.004575
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009451 | Grad Max: 0.009451
[GRADIENT NORM TOTAL] 0.2120

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.139
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51104176 0.48895824] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 91/1765 | C: 234/1814
[LOSS Ex1] A: 0.00000 | B: 0.69271 | C: 0.68861
[LOGITS Ex2 A] Mean Abs: 0.306 | Max: 1.339
[LOSS Ex2] A: 0.60089 | B: 0.61249 | C: 0.62518
** [JOINT LOSS] ** : 1.073291
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001269 | Grad Max: 0.024607
  -> Layer: shared_layers.0.bias | Grad Mean: 0.024117 | Grad Max: 0.112984
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001250 | Grad Max: 0.004946
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.023051 | Grad Max: 0.023051
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000229 | Grad Max: 0.005544
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004116 | Grad Max: 0.026181
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.002230
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002505 | Grad Max: 0.010020
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000732
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001258 | Grad Max: 0.003782
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000835
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000966 | Grad Max: 0.003472
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003207 | Grad Max: 0.009636
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048195 | Grad Max: 0.048195
[GRADIENT NORM TOTAL] 0.6019

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.148
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50557864 0.49442136] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 104/1944 | C: 219/1829
[LOSS Ex1] A: 0.69081 | B: 0.69243 | C: 0.68907
[LOGITS Ex2 A] Mean Abs: 0.314 | Max: 1.458
[LOSS Ex2] A: 0.60321 | B: 0.60062 | C: 0.61624
** [JOINT LOSS] ** : 1.297456
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001308 | Grad Max: 0.017819
  -> Layer: shared_layers.0.bias | Grad Mean: 0.005287 | Grad Max: 0.026603
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.011482
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.031404 | Grad Max: 0.031404
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000078 | Grad Max: 0.002195
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.000657 | Grad Max: 0.011530
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000017 | Grad Max: 0.000647
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000214 | Grad Max: 0.001668
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000222
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000102 | Grad Max: 0.000801
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000222
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000082 | Grad Max: 0.000702
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001070 | Grad Max: 0.003596
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003087 | Grad Max: 0.003087
[GRADIENT NORM TOTAL] 0.1833

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51748586 0.4825141 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 102/1946 | C: 241/1807
[LOSS Ex1] A: 0.69033 | B: 0.69241 | C: 0.68662
[LOGITS Ex2 A] Mean Abs: 0.340 | Max: 1.549
[LOSS Ex2] A: 0.62305 | B: 0.58884 | C: 0.61141
** [JOINT LOSS] ** : 1.297554
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001718 | Grad Max: 0.044808
  -> Layer: shared_layers.0.bias | Grad Mean: 0.037290 | Grad Max: 0.163126
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001766 | Grad Max: 0.010041
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.020341 | Grad Max: 0.020341
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000326 | Grad Max: 0.006887
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005811 | Grad Max: 0.036221
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.002961
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003484 | Grad Max: 0.013855
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.001001
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001671 | Grad Max: 0.005293
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000973
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001223 | Grad Max: 0.004492
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004451 | Grad Max: 0.015220
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061124 | Grad Max: 0.061124
[GRADIENT NORM TOTAL] 0.8594

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.091
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025032 0.4974968] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 81/1967 | C: 231/1817
[LOSS Ex1] A: 0.00000 | B: 0.69114 | C: 0.68912
[LOGITS Ex2 A] Mean Abs: 0.312 | Max: 1.625
[LOSS Ex2] A: 0.59467 | B: 0.58459 | C: 0.59669
** [JOINT LOSS] ** : 1.052070
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001361 | Grad Max: 0.022746
  -> Layer: shared_layers.0.bias | Grad Mean: 0.012618 | Grad Max: 0.055932
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001151 | Grad Max: 0.004817
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.020524 | Grad Max: 0.020524
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000122 | Grad Max: 0.003681
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001762 | Grad Max: 0.014622
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000034 | Grad Max: 0.000985
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000902 | Grad Max: 0.004056
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000345
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000420 | Grad Max: 0.001755
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000312
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000297 | Grad Max: 0.001273
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001474 | Grad Max: 0.006018
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014891 | Grad Max: 0.014891
[GRADIENT NORM TOTAL] 0.2978

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.105
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070116  0.49298844] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 90/1766 | C: 234/1814
[LOSS Ex1] A: 0.00000 | B: 0.69251 | C: 0.68761
[LOGITS Ex2 A] Mean Abs: 0.327 | Max: 1.453
[LOSS Ex2] A: 0.58418 | B: 0.58977 | C: 0.60109
** [JOINT LOSS] ** : 1.051720
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001488 | Grad Max: 0.026754
  -> Layer: shared_layers.0.bias | Grad Mean: 0.024055 | Grad Max: 0.116033
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001081 | Grad Max: 0.004577
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018517 | Grad Max: 0.018517
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000230 | Grad Max: 0.006011
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004226 | Grad Max: 0.025914
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000085 | Grad Max: 0.002168
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002470 | Grad Max: 0.010328
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000794
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001151 | Grad Max: 0.004022
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000774
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000819 | Grad Max: 0.003145
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002899 | Grad Max: 0.009886
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039225 | Grad Max: 0.039225
[GRADIENT NORM TOTAL] 0.5902

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.161
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5204957 0.4795043] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 104/1944 | C: 220/1828
[LOSS Ex1] A: 0.68993 | B: 0.69223 | C: 0.68774
[LOGITS Ex2 A] Mean Abs: 0.333 | Max: 1.633
[LOSS Ex2] A: 0.58853 | B: 0.59361 | C: 0.60708
** [JOINT LOSS] ** : 1.286374
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001661 | Grad Max: 0.034253
  -> Layer: shared_layers.0.bias | Grad Mean: 0.024894 | Grad Max: 0.122559
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001852 | Grad Max: 0.010724
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022579 | Grad Max: 0.022579
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000243 | Grad Max: 0.006546
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004403 | Grad Max: 0.029231
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.002252
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002558 | Grad Max: 0.010325
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000722
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001176 | Grad Max: 0.003667
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000733
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000827 | Grad Max: 0.002918
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003056 | Grad Max: 0.009499
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039476 | Grad Max: 0.039476
[GRADIENT NORM TOTAL] 0.6137

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.160
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51377565 0.48622432] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 100/1948 | C: 230/1818
[LOSS Ex1] A: 0.00000 | B: 0.69222 | C: 0.68658
[LOGITS Ex2 A] Mean Abs: 0.334 | Max: 1.700
[LOSS Ex2] A: 0.58790 | B: 0.57700 | C: 0.58962
** [JOINT LOSS] ** : 1.044441
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001341 | Grad Max: 0.025257
  -> Layer: shared_layers.0.bias | Grad Mean: 0.020703 | Grad Max: 0.094050
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001009 | Grad Max: 0.004691
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012883 | Grad Max: 0.012883
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000185 | Grad Max: 0.004520
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002999 | Grad Max: 0.020306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.001471
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001586 | Grad Max: 0.006720
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000398
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000707 | Grad Max: 0.002312
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000435
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000479 | Grad Max: 0.001677
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001858 | Grad Max: 0.007260
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023408 | Grad Max: 0.023408
[GRADIENT NORM TOTAL] 0.4471

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52012324 0.4798768 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 80/1968 | C: 220/1828
[LOSS Ex1] A: 0.68915 | B: 0.69094 | C: 0.68885
[LOGITS Ex2 A] Mean Abs: 0.347 | Max: 1.712
[LOSS Ex2] A: 0.59167 | B: 0.57734 | C: 0.59006
** [JOINT LOSS] ** : 1.276003
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001523 | Grad Max: 0.027598
  -> Layer: shared_layers.0.bias | Grad Mean: 0.030497 | Grad Max: 0.134791
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002037 | Grad Max: 0.011451
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.025775 | Grad Max: 0.025775
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000270 | Grad Max: 0.006310
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004598 | Grad Max: 0.030058
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.002004
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002561 | Grad Max: 0.008877
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000598
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001126 | Grad Max: 0.003385
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000632
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000756 | Grad Max: 0.002537
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002961 | Grad Max: 0.010046
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036543 | Grad Max: 0.036543
[GRADIENT NORM TOTAL] 0.6644

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.165
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5210129  0.47898713] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 90/1766 | C: 250/1798
[LOSS Ex1] A: 0.68873 | B: 0.69232 | C: 0.68858
[LOGITS Ex2 A] Mean Abs: 0.338 | Max: 1.751
[LOSS Ex2] A: 0.58697 | B: 0.56462 | C: 0.56497
** [JOINT LOSS] ** : 1.262064
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001372 | Grad Max: 0.025513
  -> Layer: shared_layers.0.bias | Grad Mean: 0.008341 | Grad Max: 0.036859
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.010371
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.023509 | Grad Max: 0.023509
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000096 | Grad Max: 0.003008
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001055 | Grad Max: 0.011698
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000022 | Grad Max: 0.000734
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000441 | Grad Max: 0.002556
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000215
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000189 | Grad Max: 0.001069
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000125 | Grad Max: 0.001113
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001474 | Grad Max: 0.004824
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006138 | Grad Max: 0.006138
[GRADIENT NORM TOTAL] 0.2260

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.138
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50984466 0.49015537] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 101/1947 | C: 250/1798
[LOSS Ex1] A: 0.00000 | B: 0.69204 | C: 0.68603
[LOGITS Ex2 A] Mean Abs: 0.347 | Max: 1.791
[LOSS Ex2] A: 0.57763 | B: 0.57639 | C: 0.58617
** [JOINT LOSS] ** : 1.039423
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001509 | Grad Max: 0.031986
  -> Layer: shared_layers.0.bias | Grad Mean: 0.019508 | Grad Max: 0.100065
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001028 | Grad Max: 0.004720
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013638 | Grad Max: 0.013638
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000198 | Grad Max: 0.005656
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003623 | Grad Max: 0.023917
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000069 | Grad Max: 0.001876
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001999 | Grad Max: 0.007842
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000548
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000871 | Grad Max: 0.002663
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000491
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.002022
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002141 | Grad Max: 0.006724
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026595 | Grad Max: 0.026595
[GRADIENT NORM TOTAL] 0.4825

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.150
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5036451 0.4963549] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 83/1965 | C: 158/1218
[LOSS Ex1] A: 0.69035 | B: 0.69204 | C: 0.68829
[LOGITS Ex2 A] Mean Abs: 0.371 | Max: 1.885
[LOSS Ex2] A: 0.57115 | B: 0.57244 | C: 0.59703
** [JOINT LOSS] ** : 1.270436
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001811 | Grad Max: 0.033460
  -> Layer: shared_layers.0.bias | Grad Mean: 0.029756 | Grad Max: 0.151382
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.011944
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.030981 | Grad Max: 0.030981
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000278 | Grad Max: 0.007220
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005299 | Grad Max: 0.031937
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000099 | Grad Max: 0.002684
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002945 | Grad Max: 0.010801
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000687
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001249 | Grad Max: 0.003735
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000673
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000807 | Grad Max: 0.002920
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003074 | Grad Max: 0.009367
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036744 | Grad Max: 0.036744
[GRADIENT NORM TOTAL] 0.6960

[EPOCH SUMMARY] Train Loss: 1.1884

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1823 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.2776 -> New: 1.1823)

############################## EPOCH 4/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51658    0.48341998] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.007
[MASKS] A(Pass/Fail): 3/2045 | B: 71/1977 | C: 235/1813
[LOSS Ex1] A: 0.68998 | B: 0.69075 | C: 0.68936
[LOGITS Ex2 A] Mean Abs: 0.385 | Max: 1.946
[LOSS Ex2] A: 0.58553 | B: 0.56031 | C: 0.56623
** [JOINT LOSS] ** : 1.260717
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001726 | Grad Max: 0.031083
  -> Layer: shared_layers.0.bias | Grad Mean: 0.027957 | Grad Max: 0.127689
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.010791
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.027551 | Grad Max: 0.027551
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000277 | Grad Max: 0.006692
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004569 | Grad Max: 0.033027
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.002430
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002521 | Grad Max: 0.010631
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000661
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001054 | Grad Max: 0.003118
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000552
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000665 | Grad Max: 0.002291
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002772 | Grad Max: 0.009569
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031114 | Grad Max: 0.031114
[GRADIENT NORM TOTAL] 0.6376

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.090
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50045496 0.49954507] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 83/1773 | C: 236/1812
[LOSS Ex1] A: 0.00000 | B: 0.69213 | C: 0.68762
[LOGITS Ex2 A] Mean Abs: 0.404 | Max: 2.173
[LOSS Ex2] A: 0.57409 | B: 0.55181 | C: 0.56259
** [JOINT LOSS] ** : 1.022748
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001769 | Grad Max: 0.038191
  -> Layer: shared_layers.0.bias | Grad Mean: 0.039509 | Grad Max: 0.184904
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001160 | Grad Max: 0.004580
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019888 | Grad Max: 0.019888
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000365 | Grad Max: 0.008395
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006438 | Grad Max: 0.044898
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.002795
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003555 | Grad Max: 0.013272
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000788
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001469 | Grad Max: 0.004225
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000802
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000916 | Grad Max: 0.003244
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003751 | Grad Max: 0.011937
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042486 | Grad Max: 0.042486
[GRADIENT NORM TOTAL] 0.8800

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.505698   0.49430197] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 87/1961 | C: 201/1847
[LOSS Ex1] A: 0.00000 | B: 0.69186 | C: 0.68813
[LOGITS Ex2 A] Mean Abs: 0.418 | Max: 1.984
[LOSS Ex2] A: 0.55556 | B: 0.56915 | C: 0.55717
** [JOINT LOSS] ** : 1.020622
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001315 | Grad Max: 0.023442
  -> Layer: shared_layers.0.bias | Grad Mean: 0.015611 | Grad Max: 0.080299
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001144 | Grad Max: 0.004641
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018053 | Grad Max: 0.018053
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000142 | Grad Max: 0.006183
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002706 | Grad Max: 0.033417
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.001814
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001325 | Grad Max: 0.007845
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000348
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000535 | Grad Max: 0.001862
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000449
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000329 | Grad Max: 0.001518
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001261 | Grad Max: 0.005177
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014203 | Grad Max: 0.014203
[GRADIENT NORM TOTAL] 0.3564

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.164
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5194731 0.4805269] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 81/1967 | C: 216/1832
[LOSS Ex1] A: 0.68950 | B: 0.69187 | C: 0.68695
[LOGITS Ex2 A] Mean Abs: 0.440 | Max: 2.143
[LOSS Ex2] A: 0.56528 | B: 0.56719 | C: 0.57345
** [JOINT LOSS] ** : 1.258078
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001924 | Grad Max: 0.036528
  -> Layer: shared_layers.0.bias | Grad Mean: 0.056701 | Grad Max: 0.273019
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001859 | Grad Max: 0.010231
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019777 | Grad Max: 0.019777
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000483 | Grad Max: 0.011386
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009224 | Grad Max: 0.054606
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000164 | Grad Max: 0.003693
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004907 | Grad Max: 0.018130
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.001111
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001970 | Grad Max: 0.005466
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.001019
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001199 | Grad Max: 0.004081
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004711 | Grad Max: 0.012029
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053672 | Grad Max: 0.053672
[GRADIENT NORM TOTAL] 1.1941

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.162
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51299936 0.48700064] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 59/1989 | C: 243/1805
[LOSS Ex1] A: 0.00000 | B: 0.69057 | C: 0.68517
[LOGITS Ex2 A] Mean Abs: 0.437 | Max: 2.274
[LOSS Ex2] A: 0.54859 | B: 0.54767 | C: 0.55146
** [JOINT LOSS] ** : 1.007817
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001277 | Grad Max: 0.023969
  -> Layer: shared_layers.0.bias | Grad Mean: 0.012185 | Grad Max: 0.059310
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001004 | Grad Max: 0.005354
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010316 | Grad Max: 0.010316
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000113 | Grad Max: 0.006230
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002045 | Grad Max: 0.033078
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000032 | Grad Max: 0.001142
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000906 | Grad Max: 0.004793
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000268
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000356 | Grad Max: 0.001337
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000262
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000215 | Grad Max: 0.000992
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001023 | Grad Max: 0.003959
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008919 | Grad Max: 0.008919
[GRADIENT NORM TOTAL] 0.2772

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51910555 0.48089442] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 73/1783 | C: 200/1848
[LOSS Ex1] A: 0.68872 | B: 0.69195 | C: 0.68762
[LOGITS Ex2 A] Mean Abs: 0.454 | Max: 2.443
[LOSS Ex2] A: 0.56003 | B: 0.54669 | C: 0.57373
** [JOINT LOSS] ** : 1.249581
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.038854
  -> Layer: shared_layers.0.bias | Grad Mean: 0.053131 | Grad Max: 0.252947
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001988 | Grad Max: 0.011387
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.023867 | Grad Max: 0.023867
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000489 | Grad Max: 0.011179
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008584 | Grad Max: 0.059600
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.003560
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004578 | Grad Max: 0.016276
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000872
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001801 | Grad Max: 0.005114
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000867
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001071 | Grad Max: 0.003381
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004479 | Grad Max: 0.012335
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048533 | Grad Max: 0.048533
[GRADIENT NORM TOTAL] 1.1424

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.164
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5199457 0.4800543] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 79/1969 | C: 207/1841
[LOSS Ex1] A: 0.68842 | B: 0.69167 | C: 0.68823
[LOGITS Ex2 A] Mean Abs: 0.450 | Max: 2.416
[LOSS Ex2] A: 0.56217 | B: 0.55174 | C: 0.58116
** [JOINT LOSS] ** : 1.254463
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.041599
  -> Layer: shared_layers.0.bias | Grad Mean: 0.040599 | Grad Max: 0.207744
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001660 | Grad Max: 0.009309
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015902 | Grad Max: 0.015902
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000388 | Grad Max: 0.009591
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006709 | Grad Max: 0.046680
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.003122
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003549 | Grad Max: 0.012660
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000798
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001389 | Grad Max: 0.004095
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000698
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000818 | Grad Max: 0.002705
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003449 | Grad Max: 0.009603
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036818 | Grad Max: 0.036818
[GRADIENT NORM TOTAL] 0.8925

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.137
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50873 0.49127] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 75/1973 | C: 206/1842
[LOSS Ex1] A: 0.00000 | B: 0.69170 | C: 0.68738
[LOGITS Ex2 A] Mean Abs: 0.462 | Max: 2.251
[LOSS Ex2] A: 0.55596 | B: 0.55642 | C: 0.55643
** [JOINT LOSS] ** : 1.015962
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001726 | Grad Max: 0.034641
  -> Layer: shared_layers.0.bias | Grad Mean: 0.056795 | Grad Max: 0.277652
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001094 | Grad Max: 0.004642
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017595 | Grad Max: 0.017595
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000475 | Grad Max: 0.011343
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009156 | Grad Max: 0.062887
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.003978
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004721 | Grad Max: 0.017240
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000917
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001816 | Grad Max: 0.005069
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000949
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001060 | Grad Max: 0.003521
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004225 | Grad Max: 0.010555
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046548 | Grad Max: 0.046548
[GRADIENT NORM TOTAL] 1.1636

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.151
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014873  0.49851266] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 58/1990 | C: 228/1820
[LOSS Ex1] A: 0.68987 | B: 0.69039 | C: 0.68709
[LOGITS Ex2 A] Mean Abs: 0.483 | Max: 2.233
[LOSS Ex2] A: 0.55310 | B: 0.55903 | C: 0.57008
** [JOINT LOSS] ** : 1.249853
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.038801
  -> Layer: shared_layers.0.bias | Grad Mean: 0.068409 | Grad Max: 0.330292
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.011074
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.027013 | Grad Max: 0.027013
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000565 | Grad Max: 0.014768
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010946 | Grad Max: 0.069838
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000186 | Grad Max: 0.004321
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005630 | Grad Max: 0.019441
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.001084
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002147 | Grad Max: 0.005904
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000985
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001244 | Grad Max: 0.004048
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004986 | Grad Max: 0.013220
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.055020 | Grad Max: 0.055020
[GRADIENT NORM TOTAL] 1.3982

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5156842 0.4843158] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.007
[MASKS] A(Pass/Fail): 3/2045 | B: 73/1783 | C: 252/1796
[LOSS Ex1] A: 0.68961 | B: 0.69177 | C: 0.68292
[LOGITS Ex2 A] Mean Abs: 0.469 | Max: 2.476
[LOSS Ex2] A: 0.56204 | B: 0.53634 | C: 0.54646
** [JOINT LOSS] ** : 1.236382
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001420 | Grad Max: 0.026983
  -> Layer: shared_layers.0.bias | Grad Mean: 0.007225 | Grad Max: 0.040864
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001751 | Grad Max: 0.009898
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013926 | Grad Max: 0.013926
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000119 | Grad Max: 0.004526
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001314 | Grad Max: 0.024233
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.000909
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000395 | Grad Max: 0.002627
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000147 | Grad Max: 0.001023
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000201
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000082 | Grad Max: 0.000513
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000574 | Grad Max: 0.002821
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004037 | Grad Max: 0.004037
[GRADIENT NORM TOTAL] 0.2208

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.089
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016016  0.49839842] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 78/1970 | C: 204/1844
[LOSS Ex1] A: 0.00000 | B: 0.69150 | C: 0.68517
[LOGITS Ex2 A] Mean Abs: 0.494 | Max: 2.574
[LOSS Ex2] A: 0.55821 | B: 0.55745 | C: 0.54856
** [JOINT LOSS] ** : 1.013628
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002707 | Grad Max: 0.063385
  -> Layer: shared_layers.0.bias | Grad Mean: 0.070725 | Grad Max: 0.346324
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001035 | Grad Max: 0.004639
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012606 | Grad Max: 0.012606
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000658 | Grad Max: 0.016227
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011917 | Grad Max: 0.075540
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000208 | Grad Max: 0.004619
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006158 | Grad Max: 0.022710
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.001188
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002330 | Grad Max: 0.006415
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000068 | Grad Max: 0.001029
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001330 | Grad Max: 0.004100
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005558 | Grad Max: 0.014280
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059058 | Grad Max: 0.059058
[GRADIENT NORM TOTAL] 1.5213

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50437933 0.49562064] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.007
[MASKS] A(Pass/Fail): 0/1616 | B: 72/1976 | C: 207/1841
[LOSS Ex1] A: 0.00000 | B: 0.69153 | C: 0.68761
[LOGITS Ex2 A] Mean Abs: 0.493 | Max: 2.470
[LOSS Ex2] A: 0.55602 | B: 0.54570 | C: 0.55668
** [JOINT LOSS] ** : 1.012515
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001842 | Grad Max: 0.042785
  -> Layer: shared_layers.0.bias | Grad Mean: 0.042327 | Grad Max: 0.215639
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001094 | Grad Max: 0.004675
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017769 | Grad Max: 0.017769
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000404 | Grad Max: 0.009749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007079 | Grad Max: 0.048611
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.002653
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003604 | Grad Max: 0.013408
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000724
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001360 | Grad Max: 0.003946
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000722
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000771 | Grad Max: 0.002671
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003245 | Grad Max: 0.009958
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034374 | Grad Max: 0.034374
[GRADIENT NORM TOTAL] 0.9127

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.166
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5185403  0.48145968] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 57/1991 | C: 218/1830
[LOSS Ex1] A: 0.68909 | B: 0.69021 | C: 0.68662
[LOGITS Ex2 A] Mean Abs: 0.498 | Max: 2.422
[LOSS Ex2] A: 0.54563 | B: 0.55751 | C: 0.55299
** [JOINT LOSS] ** : 1.240685
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.050774
  -> Layer: shared_layers.0.bias | Grad Mean: 0.065611 | Grad Max: 0.318099
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.010998
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022991 | Grad Max: 0.022991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000546 | Grad Max: 0.013489
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010600 | Grad Max: 0.065460
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.004082
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005303 | Grad Max: 0.018863
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000977
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001975 | Grad Max: 0.005399
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000879
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001112 | Grad Max: 0.003492
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004442 | Grad Max: 0.011297
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048192 | Grad Max: 0.048192
[GRADIENT NORM TOTAL] 1.3286

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.164
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5123551 0.4876449] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.007
[MASKS] A(Pass/Fail): 0/2048 | B: 72/1784 | C: 154/1222
[LOSS Ex1] A: 0.00000 | B: 0.69161 | C: 0.68545
[LOGITS Ex2 A] Mean Abs: 0.509 | Max: 2.426
[LOSS Ex2] A: 0.53863 | B: 0.54422 | C: 0.53839
** [JOINT LOSS] ** : 0.999431
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002632 | Grad Max: 0.064329
  -> Layer: shared_layers.0.bias | Grad Mean: 0.073531 | Grad Max: 0.367397
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001128 | Grad Max: 0.005063
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016704 | Grad Max: 0.016704
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000651 | Grad Max: 0.016345
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012403 | Grad Max: 0.079356
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000207 | Grad Max: 0.004879
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006203 | Grad Max: 0.021911
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.001151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002308 | Grad Max: 0.006545
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000065 | Grad Max: 0.001069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001296 | Grad Max: 0.004093
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005263 | Grad Max: 0.012831
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.056272 | Grad Max: 0.056272
[GRADIENT NORM TOTAL] 1.5297

[EPOCH SUMMARY] Train Loss: 1.1316

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1825 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 5/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51820445 0.48179555] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 76/1972 | C: 223/1825
[LOSS Ex1] A: 0.68830 | B: 0.69133 | C: 0.68625
[LOGITS Ex2 A] Mean Abs: 0.480 | Max: 2.493
[LOSS Ex2] A: 0.54048 | B: 0.54965 | C: 0.53131
** [JOINT LOSS] ** : 1.229110
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001392 | Grad Max: 0.030426
  -> Layer: shared_layers.0.bias | Grad Mean: 0.007379 | Grad Max: 0.053582
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002031 | Grad Max: 0.011351
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022244 | Grad Max: 0.022244
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000123 | Grad Max: 0.003881
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001480 | Grad Max: 0.019029
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000022 | Grad Max: 0.000885
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000345 | Grad Max: 0.003198
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000097 | Grad Max: 0.000804
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000050 | Grad Max: 0.000458
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.001987
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000926 | Grad Max: 0.000926
[GRADIENT NORM TOTAL] 0.2323

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.163
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51902765 0.48097238] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 70/1978 | C: 226/1822
[LOSS Ex1] A: 0.68813 | B: 0.69137 | C: 0.68396
[LOGITS Ex2 A] Mean Abs: 0.496 | Max: 2.740
[LOSS Ex2] A: 0.55436 | B: 0.54192 | C: 0.54009
** [JOINT LOSS] ** : 1.233278
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002565 | Grad Max: 0.060624
  -> Layer: shared_layers.0.bias | Grad Mean: 0.077463 | Grad Max: 0.380189
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001668 | Grad Max: 0.009089
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009470 | Grad Max: 0.009470
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000691 | Grad Max: 0.017956
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012475 | Grad Max: 0.091461
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000213 | Grad Max: 0.004879
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006355 | Grad Max: 0.021875
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.001146
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002343 | Grad Max: 0.006617
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000066 | Grad Max: 0.001005
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001309 | Grad Max: 0.004148
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005508 | Grad Max: 0.013076
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058357 | Grad Max: 0.058357
[GRADIENT NORM TOTAL] 1.6027

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.136
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078479 0.4921521] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 58/1990 | C: 206/1842
[LOSS Ex1] A: 0.00000 | B: 0.69005 | C: 0.68721
[LOGITS Ex2 A] Mean Abs: 0.486 | Max: 2.577
[LOSS Ex2] A: 0.54754 | B: 0.53051 | C: 0.52151
** [JOINT LOSS] ** : 0.992273
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001659 | Grad Max: 0.032489
  -> Layer: shared_layers.0.bias | Grad Mean: 0.044435 | Grad Max: 0.223910
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001207 | Grad Max: 0.004964
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019592 | Grad Max: 0.019592
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000395 | Grad Max: 0.010101
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006964 | Grad Max: 0.048802
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000115 | Grad Max: 0.003084
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003460 | Grad Max: 0.013326
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000741
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001272 | Grad Max: 0.004118
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000665
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000703 | Grad Max: 0.002374
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002987 | Grad Max: 0.007795
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031431 | Grad Max: 0.031431
[GRADIENT NORM TOTAL] 0.9060

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.153
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005552  0.49944478] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 69/1787 | C: 199/1849
[LOSS Ex1] A: 0.68942 | B: 0.69145 | C: 0.68551
[LOGITS Ex2 A] Mean Abs: 0.508 | Max: 2.530
[LOSS Ex2] A: 0.53781 | B: 0.53537 | C: 0.54862
** [JOINT LOSS] ** : 1.229398
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001999 | Grad Max: 0.041426
  -> Layer: shared_layers.0.bias | Grad Mean: 0.039534 | Grad Max: 0.215609
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001943 | Grad Max: 0.011132
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022018 | Grad Max: 0.022018
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000370 | Grad Max: 0.009865
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007002 | Grad Max: 0.053411
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.002721
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003349 | Grad Max: 0.012343
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000671
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001228 | Grad Max: 0.003901
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000603
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000679 | Grad Max: 0.002395
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002749 | Grad Max: 0.006975
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029070 | Grad Max: 0.029070
[GRADIENT NORM TOTAL] 0.8456

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.172
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5149608 0.4850392] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 3/2045 | B: 76/1972 | C: 197/1851
[LOSS Ex1] A: 0.68926 | B: 0.69117 | C: 0.68820
[LOGITS Ex2 A] Mean Abs: 0.516 | Max: 2.476
[LOSS Ex2] A: 0.54233 | B: 0.55872 | C: 0.54988
** [JOINT LOSS] ** : 1.239854
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001847 | Grad Max: 0.029661
  -> Layer: shared_layers.0.bias | Grad Mean: 0.056954 | Grad Max: 0.282872
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001877 | Grad Max: 0.010159
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.026383 | Grad Max: 0.026383
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000463 | Grad Max: 0.012564
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008913 | Grad Max: 0.064027
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.003219
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004345 | Grad Max: 0.014900
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000818
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001574 | Grad Max: 0.004405
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000764
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000869 | Grad Max: 0.002851
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003551 | Grad Max: 0.010100
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037620 | Grad Max: 0.037620
[GRADIENT NORM TOTAL] 1.1269

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.088
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50345844 0.49654156] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 69/1979 | C: 217/1831
[LOSS Ex1] A: 0.00000 | B: 0.69122 | C: 0.68647
[LOGITS Ex2 A] Mean Abs: 0.510 | Max: 2.624
[LOSS Ex2] A: 0.53575 | B: 0.53694 | C: 0.55044
** [JOINT LOSS] ** : 1.000271
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001301 | Grad Max: 0.026322
  -> Layer: shared_layers.0.bias | Grad Mean: 0.022319 | Grad Max: 0.129022
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001107 | Grad Max: 0.004770
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015646 | Grad Max: 0.015646
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000209 | Grad Max: 0.007134
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003480 | Grad Max: 0.032073
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.001656
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001516 | Grad Max: 0.006819
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000314
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000551 | Grad Max: 0.001744
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000334
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000302 | Grad Max: 0.001197
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001266 | Grad Max: 0.003567
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013610 | Grad Max: 0.013610
[GRADIENT NORM TOTAL] 0.4583

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50323445 0.49676552] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 57/1991 | C: 240/1808
[LOSS Ex1] A: 0.00000 | B: 0.68988 | C: 0.68536
[LOGITS Ex2 A] Mean Abs: 0.534 | Max: 2.748
[LOSS Ex2] A: 0.53978 | B: 0.52962 | C: 0.54905
** [JOINT LOSS] ** : 0.997897
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.029432
  -> Layer: shared_layers.0.bias | Grad Mean: 0.043142 | Grad Max: 0.207854
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001093 | Grad Max: 0.005224
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015054 | Grad Max: 0.015054
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000409 | Grad Max: 0.010749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006899 | Grad Max: 0.058903
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000115 | Grad Max: 0.003248
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003382 | Grad Max: 0.014819
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000674
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001231 | Grad Max: 0.003422
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000557
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000681 | Grad Max: 0.002204
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002927 | Grad Max: 0.006929
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030691 | Grad Max: 0.030691
[GRADIENT NORM TOTAL] 0.8801

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.168
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5177475  0.48225248] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 69/1787 | C: 170/1878
[LOSS Ex1] A: 0.68870 | B: 0.69128 | C: 0.68741
[LOGITS Ex2 A] Mean Abs: 0.533 | Max: 2.698
[LOSS Ex2] A: 0.52806 | B: 0.52539 | C: 0.54068
** [JOINT LOSS] ** : 1.220505
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001407 | Grad Max: 0.025096
  -> Layer: shared_layers.0.bias | Grad Mean: 0.012316 | Grad Max: 0.083373
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001932 | Grad Max: 0.010945
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024072 | Grad Max: 0.024072
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000119 | Grad Max: 0.005272
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001947 | Grad Max: 0.024667
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000030 | Grad Max: 0.001212
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000790 | Grad Max: 0.005084
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000279
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000278 | Grad Max: 0.001275
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000257
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000836
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000552 | Grad Max: 0.002175
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006103 | Grad Max: 0.006103
[GRADIENT NORM TOTAL] 0.2865

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.166
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117532  0.48824683] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 72/1976 | C: 211/1837
[LOSS Ex1] A: 0.00000 | B: 0.69100 | C: 0.68737
[LOGITS Ex2 A] Mean Abs: 0.528 | Max: 2.839
[LOSS Ex2] A: 0.53229 | B: 0.54058 | C: 0.55472
** [JOINT LOSS] ** : 1.001987
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001646 | Grad Max: 0.028318
  -> Layer: shared_layers.0.bias | Grad Mean: 0.027881 | Grad Max: 0.143415
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001137 | Grad Max: 0.004745
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018202 | Grad Max: 0.018202
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000235 | Grad Max: 0.009094
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004617 | Grad Max: 0.040848
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000071 | Grad Max: 0.001640
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002155 | Grad Max: 0.008160
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000404
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000776 | Grad Max: 0.002319
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000480
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000421 | Grad Max: 0.001525
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001631 | Grad Max: 0.004545
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018164 | Grad Max: 0.018164
[GRADIENT NORM TOTAL] 0.5671

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.517357   0.48264304] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 67/1981 | C: 213/1835
[LOSS Ex1] A: 0.68791 | B: 0.69106 | C: 0.68671
[LOGITS Ex2 A] Mean Abs: 0.524 | Max: 2.723
[LOSS Ex2] A: 0.51980 | B: 0.52816 | C: 0.52973
** [JOINT LOSS] ** : 1.214457
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001393 | Grad Max: 0.025935
  -> Layer: shared_layers.0.bias | Grad Mean: 0.007804 | Grad Max: 0.080741
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001967 | Grad Max: 0.010728
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.020026 | Grad Max: 0.020026
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000104 | Grad Max: 0.007253
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001319 | Grad Max: 0.029377
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000020 | Grad Max: 0.001106
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000337 | Grad Max: 0.003620
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000174
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000100 | Grad Max: 0.000604
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000183
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000054 | Grad Max: 0.000456
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000443 | Grad Max: 0.001934
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000004 | Grad Max: 0.000004
[GRADIENT NORM TOTAL] 0.2304

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.162
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51819885 0.48180115] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 53/1995 | C: 182/1866
[LOSS Ex1] A: 0.68786 | B: 0.68971 | C: 0.68685
[LOGITS Ex2 A] Mean Abs: 0.525 | Max: 2.714
[LOSS Ex2] A: 0.53753 | B: 0.52880 | C: 0.52948
** [JOINT LOSS] ** : 1.220075
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001734 | Grad Max: 0.036972
  -> Layer: shared_layers.0.bias | Grad Mean: 0.024904 | Grad Max: 0.124580
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001629 | Grad Max: 0.009148
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011759 | Grad Max: 0.011759
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000266 | Grad Max: 0.007358
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004314 | Grad Max: 0.038450
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.001992
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002015 | Grad Max: 0.008547
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000519
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000731 | Grad Max: 0.002222
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000421
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000398 | Grad Max: 0.001500
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001793 | Grad Max: 0.005279
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018146 | Grad Max: 0.018146
[GRADIENT NORM TOTAL] 0.5517

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.136
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069747  0.49302527] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 66/1790 | C: 224/1824
[LOSS Ex1] A: 0.00000 | B: 0.69111 | C: 0.68522
[LOGITS Ex2 A] Mean Abs: 0.510 | Max: 2.989
[LOSS Ex2] A: 0.53117 | B: 0.50943 | C: 0.50628
** [JOINT LOSS] ** : 0.974403
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001237 | Grad Max: 0.022786
  -> Layer: shared_layers.0.bias | Grad Mean: 0.014035 | Grad Max: 0.074890
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001093 | Grad Max: 0.005215
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013083 | Grad Max: 0.013083
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000162 | Grad Max: 0.006693
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002448 | Grad Max: 0.034534
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000038 | Grad Max: 0.001321
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001042 | Grad Max: 0.005002
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000316
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000378 | Grad Max: 0.001569
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000266
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000204 | Grad Max: 0.000830
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000974 | Grad Max: 0.003897
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009372 | Grad Max: 0.009372
[GRADIENT NORM TOTAL] 0.3323

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.155
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50258607 0.49741387] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 65/1983 | C: 211/1837
[LOSS Ex1] A: 0.68900 | B: 0.69084 | C: 0.68526
[LOGITS Ex2 A] Mean Abs: 0.538 | Max: 2.540
[LOSS Ex2] A: 0.52636 | B: 0.54178 | C: 0.53868
** [JOINT LOSS] ** : 1.223968
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001566 | Grad Max: 0.032212
  -> Layer: shared_layers.0.bias | Grad Mean: 0.038920 | Grad Max: 0.196193
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.010794
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.020778 | Grad Max: 0.020778
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000323 | Grad Max: 0.009536
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006227 | Grad Max: 0.046921
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000096 | Grad Max: 0.002522
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002908 | Grad Max: 0.010706
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000503
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001017 | Grad Max: 0.002874
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000490
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000554 | Grad Max: 0.001923
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002143 | Grad Max: 0.005652
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023606 | Grad Max: 0.023606
[GRADIENT NORM TOTAL] 0.7773

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.172
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51421845 0.48578158] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 65/1983 | C: 116/1260
[LOSS Ex1] A: 0.68893 | B: 0.69090 | C: 0.68609
[LOGITS Ex2 A] Mean Abs: 0.533 | Max: 2.935
[LOSS Ex2] A: 0.53475 | B: 0.52902 | C: 0.53377
** [JOINT LOSS] ** : 1.221152
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001288 | Grad Max: 0.023680
  -> Layer: shared_layers.0.bias | Grad Mean: 0.019156 | Grad Max: 0.079529
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001734 | Grad Max: 0.010109
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015869 | Grad Max: 0.015869
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000165 | Grad Max: 0.007734
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002964 | Grad Max: 0.044326
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.001484
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001200 | Grad Max: 0.005424
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000349
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000406 | Grad Max: 0.001926
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000258
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000222 | Grad Max: 0.000964
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000797 | Grad Max: 0.002606
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008944 | Grad Max: 0.008944
[GRADIENT NORM TOTAL] 0.3853

[EPOCH SUMMARY] Train Loss: 1.1428

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1291 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.1823 -> New: 1.1291)

############################## EPOCH 6/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.088
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50534266 0.49465734] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 50/1998 | C: 218/1830
[LOSS Ex1] A: 0.00000 | B: 0.68955 | C: 0.68605
[LOGITS Ex2 A] Mean Abs: 0.532 | Max: 2.744
[LOSS Ex2] A: 0.51887 | B: 0.52301 | C: 0.52195
** [JOINT LOSS] ** : 0.979809
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001504 | Grad Max: 0.024302
  -> Layer: shared_layers.0.bias | Grad Mean: 0.024432 | Grad Max: 0.110647
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001114 | Grad Max: 0.005264
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014180 | Grad Max: 0.014180
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000258 | Grad Max: 0.007922
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004176 | Grad Max: 0.041863
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.001754
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001914 | Grad Max: 0.008169
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000468
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000688 | Grad Max: 0.002397
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000432
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000374 | Grad Max: 0.001487
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001634 | Grad Max: 0.004935
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016991 | Grad Max: 0.016991
[GRADIENT NORM TOTAL] 0.5179

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50204796 0.49795207] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 62/1794 | C: 176/1872
[LOSS Ex1] A: 0.00000 | B: 0.69095 | C: 0.68787
[LOGITS Ex2 A] Mean Abs: 0.545 | Max: 2.733
[LOSS Ex2] A: 0.52642 | B: 0.51226 | C: 0.52560
** [JOINT LOSS] ** : 0.981033
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001465 | Grad Max: 0.024768
  -> Layer: shared_layers.0.bias | Grad Mean: 0.031608 | Grad Max: 0.147032
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001136 | Grad Max: 0.004784
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017543 | Grad Max: 0.017543
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000294 | Grad Max: 0.007647
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005052 | Grad Max: 0.038045
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.002148
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002369 | Grad Max: 0.010120
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000484
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000842 | Grad Max: 0.002649
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000511
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000454 | Grad Max: 0.002058
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001942 | Grad Max: 0.006403
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020587 | Grad Max: 0.020587
[GRADIENT NORM TOTAL] 0.6385

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.171
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51697916 0.4830209 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 62/1986 | C: 194/1854
[LOSS Ex1] A: 0.68831 | B: 0.69067 | C: 0.68681
[LOGITS Ex2 A] Mean Abs: 0.553 | Max: 2.669
[LOSS Ex2] A: 0.51743 | B: 0.53257 | C: 0.53365
** [JOINT LOSS] ** : 1.216480
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001815 | Grad Max: 0.043466
  -> Layer: shared_layers.0.bias | Grad Mean: 0.040945 | Grad Max: 0.217127
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.011166
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.025102 | Grad Max: 0.025102
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000365 | Grad Max: 0.010041
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007057 | Grad Max: 0.054933
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.002772
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003326 | Grad Max: 0.012134
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000566
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001159 | Grad Max: 0.003187
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000515
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000624 | Grad Max: 0.001930
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002515 | Grad Max: 0.006985
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026766 | Grad Max: 0.026766
[GRADIENT NORM TOTAL] 0.8544

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.170
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5111431 0.4888569] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 63/1985 | C: 230/1818
[LOSS Ex1] A: 0.00000 | B: 0.69074 | C: 0.68387
[LOGITS Ex2 A] Mean Abs: 0.542 | Max: 2.729
[LOSS Ex2] A: 0.51692 | B: 0.51935 | C: 0.52354
** [JOINT LOSS] ** : 0.978143
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001310 | Grad Max: 0.024196
  -> Layer: shared_layers.0.bias | Grad Mean: 0.009203 | Grad Max: 0.074947
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001065 | Grad Max: 0.005386
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009082 | Grad Max: 0.009082
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000109 | Grad Max: 0.008875
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001661 | Grad Max: 0.048021
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000026 | Grad Max: 0.001368
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000593 | Grad Max: 0.004878
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000318
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000187 | Grad Max: 0.001104
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000100 | Grad Max: 0.000644
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000481 | Grad Max: 0.002187
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003759 | Grad Max: 0.003759
[GRADIENT NORM TOTAL] 0.2543

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5165047 0.4834953] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 2/2046 | B: 50/1998 | C: 184/1864
[LOSS Ex1] A: 0.68752 | B: 0.68938 | C: 0.68212
[LOGITS Ex2 A] Mean Abs: 0.531 | Max: 2.774
[LOSS Ex2] A: 0.52017 | B: 0.51610 | C: 0.50755
** [JOINT LOSS] ** : 1.200948
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001576 | Grad Max: 0.023489
  -> Layer: shared_layers.0.bias | Grad Mean: 0.031091 | Grad Max: 0.143981
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.010495
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008827 | Grad Max: 0.008827
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000293 | Grad Max: 0.011097
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004732 | Grad Max: 0.046883
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.002117
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002177 | Grad Max: 0.009072
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000410
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000752 | Grad Max: 0.002236
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000393
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000406 | Grad Max: 0.001514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001822 | Grad Max: 0.005220
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018876 | Grad Max: 0.018876
[GRADIENT NORM TOTAL] 0.6187

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.161
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51741415 0.48258588] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 63/1793 | C: 216/1832
[LOSS Ex1] A: 0.68758 | B: 0.69078 | C: 0.68455
[LOGITS Ex2 A] Mean Abs: 0.530 | Max: 2.836
[LOSS Ex2] A: 0.52892 | B: 0.50219 | C: 0.51742
** [JOINT LOSS] ** : 1.203812
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001387 | Grad Max: 0.028221
  -> Layer: shared_layers.0.bias | Grad Mean: 0.015110 | Grad Max: 0.074633
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001676 | Grad Max: 0.009343
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011110 | Grad Max: 0.011110
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000172 | Grad Max: 0.007887
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002604 | Grad Max: 0.033906
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.001546
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001122 | Grad Max: 0.006333
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000284
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000394 | Grad Max: 0.001445
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000333
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000212 | Grad Max: 0.001014
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001066 | Grad Max: 0.003673
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010485 | Grad Max: 0.010485
[GRADIENT NORM TOTAL] 0.3593

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.136
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.506132 0.493868] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 62/1986 | C: 184/1864
[LOSS Ex1] A: 0.00000 | B: 0.69050 | C: 0.68904
[LOGITS Ex2 A] Mean Abs: 0.533 | Max: 2.692
[LOSS Ex2] A: 0.51851 | B: 0.53078 | C: 0.50655
** [JOINT LOSS] ** : 0.978462
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001661 | Grad Max: 0.042217
  -> Layer: shared_layers.0.bias | Grad Mean: 0.052706 | Grad Max: 0.276055
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001530 | Grad Max: 0.006034
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.028762 | Grad Max: 0.028762
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000452 | Grad Max: 0.012068
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008541 | Grad Max: 0.066558
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.002860
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004000 | Grad Max: 0.013394
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000700
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001363 | Grad Max: 0.003615
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000577
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000725 | Grad Max: 0.002489
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002840 | Grad Max: 0.006864
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030730 | Grad Max: 0.030730
[GRADIENT NORM TOTAL] 1.0436

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.157
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50455683 0.49544317] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 63/1985 | C: 210/1838
[LOSS Ex1] A: 0.68857 | B: 0.69058 | C: 0.68616
[LOGITS Ex2 A] Mean Abs: 0.542 | Max: 2.933
[LOSS Ex2] A: 0.51831 | B: 0.51584 | C: 0.50813
** [JOINT LOSS] ** : 1.202531
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001241 | Grad Max: 0.021107
  -> Layer: shared_layers.0.bias | Grad Mean: 0.009940 | Grad Max: 0.048633
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002041 | Grad Max: 0.011028
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024833 | Grad Max: 0.024833
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000133 | Grad Max: 0.006888
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001765 | Grad Max: 0.038353
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000027 | Grad Max: 0.001258
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000617 | Grad Max: 0.004247
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000322
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000204 | Grad Max: 0.001011
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000210
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.000690
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000703 | Grad Max: 0.003266
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005321 | Grad Max: 0.005321
[GRADIENT NORM TOTAL] 0.2583

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5135595  0.48644048] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 50/1998 | C: 162/1886
[LOSS Ex1] A: 0.68860 | B: 0.68921 | C: 0.68684
[LOGITS Ex2 A] Mean Abs: 0.547 | Max: 3.003
[LOSS Ex2] A: 0.53199 | B: 0.50768 | C: 0.52897
** [JOINT LOSS] ** : 1.211098
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.049757
  -> Layer: shared_layers.0.bias | Grad Mean: 0.033185 | Grad Max: 0.158825
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001761 | Grad Max: 0.009777
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019749 | Grad Max: 0.019749
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000352 | Grad Max: 0.009325
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005900 | Grad Max: 0.048400
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.002409
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002782 | Grad Max: 0.011746
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000511
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000954 | Grad Max: 0.002818
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000428
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000511 | Grad Max: 0.001712
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002235 | Grad Max: 0.005598
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022687 | Grad Max: 0.022687
[GRADIENT NORM TOTAL] 0.7205

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.087
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50723404 0.49276593] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 66/1790 | C: 177/1871
[LOSS Ex1] A: 0.00000 | B: 0.69062 | C: 0.68639
[LOGITS Ex2 A] Mean Abs: 0.552 | Max: 2.933
[LOSS Ex2] A: 0.50671 | B: 0.49487 | C: 0.51747
** [JOINT LOSS] ** : 0.965351
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001222 | Grad Max: 0.020312
  -> Layer: shared_layers.0.bias | Grad Mean: 0.007300 | Grad Max: 0.060675
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001045 | Grad Max: 0.004511
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015426 | Grad Max: 0.015426
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000114 | Grad Max: 0.006272
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001371 | Grad Max: 0.035143
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.001073
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000434 | Grad Max: 0.004028
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000136 | Grad Max: 0.001148
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000069 | Grad Max: 0.000440
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000686 | Grad Max: 0.002489
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002752 | Grad Max: 0.002752
[GRADIENT NORM TOTAL] 0.2242

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008671 0.4991329] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 64/1984 | C: 205/1843
[LOSS Ex1] A: 0.00000 | B: 0.69033 | C: 0.68556
[LOGITS Ex2 A] Mean Abs: 0.579 | Max: 2.948
[LOSS Ex2] A: 0.50062 | B: 0.52490 | C: 0.52674
** [JOINT LOSS] ** : 0.976053
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001574 | Grad Max: 0.032614
  -> Layer: shared_layers.0.bias | Grad Mean: 0.025411 | Grad Max: 0.120497
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001072 | Grad Max: 0.004960
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013263 | Grad Max: 0.013263
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000239 | Grad Max: 0.011582
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004555 | Grad Max: 0.062425
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.001959
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002009 | Grad Max: 0.007686
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000410
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000679 | Grad Max: 0.002152
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000363
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000353 | Grad Max: 0.001473
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001350 | Grad Max: 0.004108
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014377 | Grad Max: 0.014377
[GRADIENT NORM TOTAL] 0.5421

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.174
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51625067 0.48374933] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 63/1985 | C: 192/1856
[LOSS Ex1] A: 0.68794 | B: 0.69042 | C: 0.68523
[LOGITS Ex2 A] Mean Abs: 0.574 | Max: 2.860
[LOSS Ex2] A: 0.51524 | B: 0.50739 | C: 0.51994
** [JOINT LOSS] ** : 1.202051
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001659 | Grad Max: 0.032891
  -> Layer: shared_layers.0.bias | Grad Mean: 0.033307 | Grad Max: 0.175288
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001833 | Grad Max: 0.010178
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016216 | Grad Max: 0.016216
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000322 | Grad Max: 0.010280
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005918 | Grad Max: 0.048324
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.002545
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002706 | Grad Max: 0.010407
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000515
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000903 | Grad Max: 0.002652
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000466
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000476 | Grad Max: 0.001786
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001811 | Grad Max: 0.004959
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019596 | Grad Max: 0.019596
[GRADIENT NORM TOTAL] 0.7036

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105901 0.4894099] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 50/1998 | C: 209/1839
[LOSS Ex1] A: 0.00000 | B: 0.68904 | C: 0.68522
[LOGITS Ex2 A] Mean Abs: 0.561 | Max: 2.874
[LOSS Ex2] A: 0.51521 | B: 0.50047 | C: 0.50763
** [JOINT LOSS] ** : 0.965858
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001507 | Grad Max: 0.024436
  -> Layer: shared_layers.0.bias | Grad Mean: 0.036664 | Grad Max: 0.171374
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001037 | Grad Max: 0.005243
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011228 | Grad Max: 0.011228
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000326 | Grad Max: 0.010578
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005621 | Grad Max: 0.055375
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.002047
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002660 | Grad Max: 0.009036
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000579
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000897 | Grad Max: 0.002561
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000415
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000468 | Grad Max: 0.001695
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001989 | Grad Max: 0.005744
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020598 | Grad Max: 0.020598
[GRADIENT NORM TOTAL] 0.7172

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5156869  0.48431307] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 66/1790 | C: 125/1251
[LOSS Ex1] A: 0.68713 | B: 0.69045 | C: 0.68487
[LOGITS Ex2 A] Mean Abs: 0.563 | Max: 3.009
[LOSS Ex2] A: 0.50511 | B: 0.49842 | C: 0.52784
** [JOINT LOSS] ** : 1.197941
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.027064
  -> Layer: shared_layers.0.bias | Grad Mean: 0.044123 | Grad Max: 0.216921
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001943 | Grad Max: 0.011184
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.020557 | Grad Max: 0.020557
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000376 | Grad Max: 0.010977
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006635 | Grad Max: 0.052783
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000101 | Grad Max: 0.002527
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003108 | Grad Max: 0.012447
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000571
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001037 | Grad Max: 0.003253
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000530
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000546 | Grad Max: 0.001870
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002140 | Grad Max: 0.005906
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023866 | Grad Max: 0.023866
[GRADIENT NORM TOTAL] 0.8545

[EPOCH SUMMARY] Train Loss: 1.0900

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1453 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 7/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.163
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5166547  0.48334527] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 66/1982 | C: 193/1855
[LOSS Ex1] A: 0.68731 | B: 0.69017 | C: 0.68463
[LOGITS Ex2 A] Mean Abs: 0.572 | Max: 2.975
[LOSS Ex2] A: 0.50837 | B: 0.51307 | C: 0.51282
** [JOINT LOSS] ** : 1.198791
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001222 | Grad Max: 0.026620
  -> Layer: shared_layers.0.bias | Grad Mean: 0.024982 | Grad Max: 0.129007
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001640 | Grad Max: 0.009119
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010496 | Grad Max: 0.010496
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000216 | Grad Max: 0.013134
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004005 | Grad Max: 0.071642
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.001722
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001736 | Grad Max: 0.007027
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000400
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000570 | Grad Max: 0.001931
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000318
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000296 | Grad Max: 0.001316
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001069 | Grad Max: 0.002983
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011836 | Grad Max: 0.011836
[GRADIENT NORM TOTAL] 0.5000

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.135
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052826  0.49471748] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 63/1985 | C: 190/1858
[LOSS Ex1] A: 0.00000 | B: 0.69026 | C: 0.68585
[LOGITS Ex2 A] Mean Abs: 0.571 | Max: 2.957
[LOSS Ex2] A: 0.51065 | B: 0.50141 | C: 0.51224
** [JOINT LOSS] ** : 0.966803
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001680 | Grad Max: 0.043896
  -> Layer: shared_layers.0.bias | Grad Mean: 0.047891 | Grad Max: 0.267380
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001059 | Grad Max: 0.005079
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012714 | Grad Max: 0.012714
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000428 | Grad Max: 0.013223
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008072 | Grad Max: 0.069017
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.002682
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003720 | Grad Max: 0.012898
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000588
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001213 | Grad Max: 0.003378
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000586
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000625 | Grad Max: 0.002188
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002389 | Grad Max: 0.005867
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025739 | Grad Max: 0.025739
[GRADIENT NORM TOTAL] 0.9687

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.160
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50645214 0.49354786] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 51/1997 | C: 196/1852
[LOSS Ex1] A: 0.68816 | B: 0.68887 | C: 0.68516
[LOGITS Ex2 A] Mean Abs: 0.586 | Max: 3.179
[LOSS Ex2] A: 0.50405 | B: 0.49673 | C: 0.50634
** [JOINT LOSS] ** : 1.189768
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001239 | Grad Max: 0.022194
  -> Layer: shared_layers.0.bias | Grad Mean: 0.011635 | Grad Max: 0.085967
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.011082
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021818 | Grad Max: 0.021818
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000137 | Grad Max: 0.007208
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001864 | Grad Max: 0.035852
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000027 | Grad Max: 0.001233
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000680 | Grad Max: 0.004415
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000255
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000231 | Grad Max: 0.001186
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000124 | Grad Max: 0.000666
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000678 | Grad Max: 0.003018
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006113 | Grad Max: 0.006113
[GRADIENT NORM TOTAL] 0.2799

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.177
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5129169  0.48708302] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 66/1790 | C: 179/1869
[LOSS Ex1] A: 0.68828 | B: 0.69028 | C: 0.68543
[LOGITS Ex2 A] Mean Abs: 0.587 | Max: 3.082
[LOSS Ex2] A: 0.50889 | B: 0.49214 | C: 0.53162
** [JOINT LOSS] ** : 1.198883
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002462 | Grad Max: 0.070061
  -> Layer: shared_layers.0.bias | Grad Mean: 0.066738 | Grad Max: 0.349259
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001726 | Grad Max: 0.009808
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017326 | Grad Max: 0.017326
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000639 | Grad Max: 0.019292
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011404 | Grad Max: 0.099436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000179 | Grad Max: 0.003691
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005437 | Grad Max: 0.018404
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000864
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001784 | Grad Max: 0.004580
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000654
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000921 | Grad Max: 0.002751
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003748 | Grad Max: 0.009266
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039598 | Grad Max: 0.039598
[GRADIENT NORM TOTAL] 1.3750

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090695 0.4909305] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 66/1982 | C: 193/1855
[LOSS Ex1] A: 0.00000 | B: 0.69000 | C: 0.68719
[LOGITS Ex2 A] Mean Abs: 0.588 | Max: 3.094
[LOSS Ex2] A: 0.50313 | B: 0.50869 | C: 0.49909
** [JOINT LOSS] ** : 0.962699
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001373 | Grad Max: 0.022430
  -> Layer: shared_layers.0.bias | Grad Mean: 0.009645 | Grad Max: 0.068038
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.005781
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024995 | Grad Max: 0.024995
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000137 | Grad Max: 0.009804
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001892 | Grad Max: 0.050665
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000028 | Grad Max: 0.001465
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000661 | Grad Max: 0.005323
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000242
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000220 | Grad Max: 0.001335
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000111 | Grad Max: 0.000727
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000593 | Grad Max: 0.003046
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004892 | Grad Max: 0.004892
[GRADIENT NORM TOTAL] 0.2832

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002838  0.49971628] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/1616 | B: 62/1986 | C: 185/1863
[LOSS Ex1] A: 0.00000 | B: 0.69010 | C: 0.68523
[LOGITS Ex2 A] Mean Abs: 0.633 | Max: 3.183
[LOSS Ex2] A: 0.49007 | B: 0.50930 | C: 0.50278
** [JOINT LOSS] ** : 0.959159
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001581 | Grad Max: 0.026183
  -> Layer: shared_layers.0.bias | Grad Mean: 0.045153 | Grad Max: 0.227735
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001102 | Grad Max: 0.005113
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012787 | Grad Max: 0.012787
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000390 | Grad Max: 0.016125
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007410 | Grad Max: 0.089090
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.003029
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003400 | Grad Max: 0.012536
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000589
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001078 | Grad Max: 0.003361
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000417
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000545 | Grad Max: 0.001677
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001929 | Grad Max: 0.004586
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021621 | Grad Max: 0.021621
[GRADIENT NORM TOTAL] 0.8842

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51558936 0.48441064] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 51/1997 | C: 189/1859
[LOSS Ex1] A: 0.68757 | B: 0.68870 | C: 0.68444
[LOGITS Ex2 A] Mean Abs: 0.612 | Max: 3.102
[LOSS Ex2] A: 0.50618 | B: 0.49837 | C: 0.50648
** [JOINT LOSS] ** : 1.190575
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001502 | Grad Max: 0.031811
  -> Layer: shared_layers.0.bias | Grad Mean: 0.038535 | Grad Max: 0.217347
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.010432
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016209 | Grad Max: 0.016209
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000324 | Grad Max: 0.017731
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006201 | Grad Max: 0.094083
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.002458
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002785 | Grad Max: 0.010429
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000538
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000885 | Grad Max: 0.002824
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000350
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000450 | Grad Max: 0.001491
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001601 | Grad Max: 0.004025
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017845 | Grad Max: 0.017845
[GRADIENT NORM TOTAL] 0.7619

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.177
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5100559  0.48994413] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 67/1789 | C: 179/1869
[LOSS Ex1] A: 0.68775 | B: 0.69012 | C: 0.68622
[LOGITS Ex2 A] Mean Abs: 0.604 | Max: 3.136
[LOSS Ex2] A: 0.49914 | B: 0.49513 | C: 0.48691
** [JOINT LOSS] ** : 1.181755
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.044163
  -> Layer: shared_layers.0.bias | Grad Mean: 0.057964 | Grad Max: 0.304553
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.011262
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.023109 | Grad Max: 0.023109
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000525 | Grad Max: 0.018794
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009486 | Grad Max: 0.098453
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.003358
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004557 | Grad Max: 0.015738
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000627
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001470 | Grad Max: 0.003815
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000586
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000747 | Grad Max: 0.002428
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002950 | Grad Max: 0.007537
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032059 | Grad Max: 0.032059
[GRADIENT NORM TOTAL] 1.1652

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.181
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51494133 0.48505864] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 66/1982 | C: 175/1873
[LOSS Ex1] A: 0.68674 | B: 0.68983 | C: 0.68639
[LOGITS Ex2 A] Mean Abs: 0.595 | Max: 3.159
[LOSS Ex2] A: 0.50041 | B: 0.50842 | C: 0.49749
** [JOINT LOSS] ** : 1.189760
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001371 | Grad Max: 0.036705
  -> Layer: shared_layers.0.bias | Grad Mean: 0.053471 | Grad Max: 0.276733
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.011418
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021426 | Grad Max: 0.021426
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000443 | Grad Max: 0.014052
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007993 | Grad Max: 0.071995
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.002685
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003821 | Grad Max: 0.013537
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000581
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001213 | Grad Max: 0.003302
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000517
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000616 | Grad Max: 0.002098
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002436 | Grad Max: 0.006151
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026750 | Grad Max: 0.026750
[GRADIENT NORM TOTAL] 1.0363

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.166
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5159615 0.4840385] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 62/1986 | C: 182/1866
[LOSS Ex1] A: 0.68703 | B: 0.68994 | C: 0.68545
[LOGITS Ex2 A] Mean Abs: 0.613 | Max: 2.895
[LOSS Ex2] A: 0.49483 | B: 0.50625 | C: 0.50717
** [JOINT LOSS] ** : 1.190224
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.042219
  -> Layer: shared_layers.0.bias | Grad Mean: 0.071472 | Grad Max: 0.401655
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001696 | Grad Max: 0.009181
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014394 | Grad Max: 0.014394
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000624 | Grad Max: 0.018347
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011893 | Grad Max: 0.102828
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.004153
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005606 | Grad Max: 0.018722
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000781
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001763 | Grad Max: 0.004806
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000711
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000885 | Grad Max: 0.002850
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003294 | Grad Max: 0.007399
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035928 | Grad Max: 0.035928
[GRADIENT NORM TOTAL] 1.4308

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.135
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50444365 0.4955564 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 52/1996 | C: 160/1888
[LOSS Ex1] A: 0.00000 | B: 0.68852 | C: 0.68578
[LOGITS Ex2 A] Mean Abs: 0.588 | Max: 3.021
[LOSS Ex2] A: 0.50447 | B: 0.49194 | C: 0.49586
** [JOINT LOSS] ** : 0.955524
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.044187
  -> Layer: shared_layers.0.bias | Grad Mean: 0.054867 | Grad Max: 0.285694
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001067 | Grad Max: 0.005091
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011222 | Grad Max: 0.011222
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000507 | Grad Max: 0.016042
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009455 | Grad Max: 0.084474
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.003195
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004291 | Grad Max: 0.015217
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000643
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001347 | Grad Max: 0.003747
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000504
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000678 | Grad Max: 0.002153
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002516 | Grad Max: 0.005919
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027413 | Grad Max: 0.027413
[GRADIENT NORM TOTAL] 1.1047

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.163
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50835836 0.49164167] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 68/1788 | C: 208/1840
[LOSS Ex1] A: 0.68773 | B: 0.68995 | C: 0.68253
[LOGITS Ex2 A] Mean Abs: 0.606 | Max: 3.220
[LOSS Ex2] A: 0.50201 | B: 0.48693 | C: 0.47924
** [JOINT LOSS] ** : 1.176134
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001582 | Grad Max: 0.036885
  -> Layer: shared_layers.0.bias | Grad Mean: 0.051659 | Grad Max: 0.280329
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001893 | Grad Max: 0.010370
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015015 | Grad Max: 0.015015
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000463 | Grad Max: 0.015685
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008471 | Grad Max: 0.087075
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000130 | Grad Max: 0.003649
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004004 | Grad Max: 0.018398
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000545
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001262 | Grad Max: 0.003371
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000471
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000634 | Grad Max: 0.001970
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002516 | Grad Max: 0.006560
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027149 | Grad Max: 0.027149
[GRADIENT NORM TOTAL] 1.0450

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.181
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5123 0.4877] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 67/1981 | C: 199/1849
[LOSS Ex1] A: 0.68794 | B: 0.68966 | C: 0.68361
[LOGITS Ex2 A] Mean Abs: 0.605 | Max: 3.082
[LOSS Ex2] A: 0.49829 | B: 0.51394 | C: 0.48242
** [JOINT LOSS] ** : 1.185288
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.053429
  -> Layer: shared_layers.0.bias | Grad Mean: 0.042927 | Grad Max: 0.220794
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001744 | Grad Max: 0.009463
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014881 | Grad Max: 0.014881
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000443 | Grad Max: 0.014232
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007623 | Grad Max: 0.070951
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.003213
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003520 | Grad Max: 0.014701
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000526
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001120 | Grad Max: 0.002974
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000510
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000561 | Grad Max: 0.001933
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002203 | Grad Max: 0.005778
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023724 | Grad Max: 0.023724
[GRADIENT NORM TOTAL] 0.8994

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109639  0.48903605] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 62/1986 | C: 108/1268
[LOSS Ex1] A: 0.00000 | B: 0.68978 | C: 0.68594
[LOGITS Ex2 A] Mean Abs: 0.634 | Max: 3.071
[LOSS Ex2] A: 0.49336 | B: 0.49512 | C: 0.51424
** [JOINT LOSS] ** : 0.959479
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001649 | Grad Max: 0.034500
  -> Layer: shared_layers.0.bias | Grad Mean: 0.052616 | Grad Max: 0.294213
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001092 | Grad Max: 0.004875
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016009 | Grad Max: 0.016009
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000450 | Grad Max: 0.017203
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008520 | Grad Max: 0.095446
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000129 | Grad Max: 0.003364
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003944 | Grad Max: 0.013906
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000586
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001218 | Grad Max: 0.003567
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000444
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000608 | Grad Max: 0.001940
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002339 | Grad Max: 0.005538
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024794 | Grad Max: 0.024794
[GRADIENT NORM TOTAL] 1.0374

[EPOCH SUMMARY] Train Loss: 1.1075

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1628 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 8/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014596 0.4985404] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/1616 | B: 53/1995 | C: 202/1846
[LOSS Ex1] A: 0.00000 | B: 0.68835 | C: 0.68522
[LOGITS Ex2 A] Mean Abs: 0.662 | Max: 3.136
[LOSS Ex2] A: 0.48063 | B: 0.49216 | C: 0.49440
** [JOINT LOSS] ** : 0.946921
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001281 | Grad Max: 0.028011
  -> Layer: shared_layers.0.bias | Grad Mean: 0.031994 | Grad Max: 0.170667
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001090 | Grad Max: 0.005328
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011797 | Grad Max: 0.011797
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000270 | Grad Max: 0.018238
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005030 | Grad Max: 0.099358
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.002314
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002199 | Grad Max: 0.008153
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000422
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000674 | Grad Max: 0.002407
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000295
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000333 | Grad Max: 0.001219
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001077 | Grad Max: 0.003396
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012529 | Grad Max: 0.012529
[GRADIENT NORM TOTAL] 0.6271

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.182
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51497704 0.48502293] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 69/1787 | C: 192/1856
[LOSS Ex1] A: 0.68718 | B: 0.68979 | C: 0.68401
[LOGITS Ex2 A] Mean Abs: 0.622 | Max: 3.213
[LOSS Ex2] A: 0.48330 | B: 0.48295 | C: 0.48954
** [JOINT LOSS] ** : 1.172257
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001308 | Grad Max: 0.022284
  -> Layer: shared_layers.0.bias | Grad Mean: 0.022813 | Grad Max: 0.105789
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.010660
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016261 | Grad Max: 0.016261
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000234 | Grad Max: 0.010593
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004003 | Grad Max: 0.049222
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.001733
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001776 | Grad Max: 0.007097
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000374
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000551 | Grad Max: 0.001747
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000278
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000272 | Grad Max: 0.001141
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001100 | Grad Max: 0.003844
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011973 | Grad Max: 0.011973
[GRADIENT NORM TOTAL] 0.4919

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.181
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5095605  0.49043947] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 68/1980 | C: 170/1878
[LOSS Ex1] A: 0.68733 | B: 0.68949 | C: 0.68494
[LOGITS Ex2 A] Mean Abs: 0.623 | Max: 3.363
[LOSS Ex2] A: 0.49127 | B: 0.50046 | C: 0.52903
** [JOINT LOSS] ** : 1.194175
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001537 | Grad Max: 0.040065
  -> Layer: shared_layers.0.bias | Grad Mean: 0.045669 | Grad Max: 0.247488
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.011131
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019835 | Grad Max: 0.019835
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000417 | Grad Max: 0.016754
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007281 | Grad Max: 0.083930
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.002887
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003318 | Grad Max: 0.011793
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000496
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001020 | Grad Max: 0.002941
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000434
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000511 | Grad Max: 0.001787
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002014 | Grad Max: 0.004731
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022336 | Grad Max: 0.022336
[GRADIENT NORM TOTAL] 0.9169

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.185
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5142704  0.48572958] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 62/1986 | C: 203/1845
[LOSS Ex1] A: 0.68632 | B: 0.68961 | C: 0.68397
[LOGITS Ex2 A] Mean Abs: 0.640 | Max: 3.064
[LOSS Ex2] A: 0.49366 | B: 0.48732 | C: 0.48080
** [JOINT LOSS] ** : 1.173897
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001788 | Grad Max: 0.051340
  -> Layer: shared_layers.0.bias | Grad Mean: 0.035261 | Grad Max: 0.196354
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001901 | Grad Max: 0.010620
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015471 | Grad Max: 0.015471
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000370 | Grad Max: 0.013684
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006649 | Grad Max: 0.072739
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000101 | Grad Max: 0.002562
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002996 | Grad Max: 0.011374
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000504
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000922 | Grad Max: 0.002904
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000364
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000454 | Grad Max: 0.001368
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001611 | Grad Max: 0.004018
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017602 | Grad Max: 0.017602
[GRADIENT NORM TOTAL] 0.7762

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.169
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51535636 0.4846436 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 53/1995 | C: 167/1881
[LOSS Ex1] A: 0.68673 | B: 0.68818 | C: 0.68448
[LOGITS Ex2 A] Mean Abs: 0.641 | Max: 3.258
[LOSS Ex2] A: 0.48576 | B: 0.48379 | C: 0.49133
** [JOINT LOSS] ** : 1.173422
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001243 | Grad Max: 0.026636
  -> Layer: shared_layers.0.bias | Grad Mean: 0.027474 | Grad Max: 0.140770
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001733 | Grad Max: 0.009038
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007408 | Grad Max: 0.007408
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000247 | Grad Max: 0.013958
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004284 | Grad Max: 0.074683
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002091
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001570 | Grad Max: 0.008109
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000325
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000460 | Grad Max: 0.001888
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000248
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000220 | Grad Max: 0.000966
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000648 | Grad Max: 0.002650
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007781 | Grad Max: 0.007781
[GRADIENT NORM TOTAL] 0.5283

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.136
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5036633  0.49633673] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 69/1787 | C: 184/1864
[LOSS Ex1] A: 0.00000 | B: 0.68962 | C: 0.68486
[LOGITS Ex2 A] Mean Abs: 0.604 | Max: 3.159
[LOSS Ex2] A: 0.48655 | B: 0.47878 | C: 0.48689
** [JOINT LOSS] ** : 0.942236
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001444 | Grad Max: 0.034419
  -> Layer: shared_layers.0.bias | Grad Mean: 0.032541 | Grad Max: 0.179268
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001141 | Grad Max: 0.005096
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015473 | Grad Max: 0.015473
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000324 | Grad Max: 0.011393
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005725 | Grad Max: 0.062115
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.002297
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002519 | Grad Max: 0.009383
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000390
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000787 | Grad Max: 0.002366
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000318
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000388 | Grad Max: 0.001387
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001561 | Grad Max: 0.004785
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016957 | Grad Max: 0.016957
[GRADIENT NORM TOTAL] 0.6806

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.167
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.510231   0.48976895] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 69/1979 | C: 190/1858
[LOSS Ex1] A: 0.68728 | B: 0.68932 | C: 0.68446
[LOGITS Ex2 A] Mean Abs: 0.648 | Max: 3.216
[LOSS Ex2] A: 0.48450 | B: 0.49997 | C: 0.49178
** [JOINT LOSS] ** : 1.179105
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001100 | Grad Max: 0.017085
  -> Layer: shared_layers.0.bias | Grad Mean: 0.007199 | Grad Max: 0.040773
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002035 | Grad Max: 0.010743
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022081 | Grad Max: 0.022081
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000121 | Grad Max: 0.016456
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001697 | Grad Max: 0.090866
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.001317
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000447 | Grad Max: 0.004513
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000181
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000113 | Grad Max: 0.000889
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000137
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000055 | Grad Max: 0.000490
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001906
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001914 | Grad Max: 0.001914
[GRADIENT NORM TOTAL] 0.2546

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.186
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117834  0.48821658] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 62/1986 | C: 155/1893
[LOSS Ex1] A: 0.68758 | B: 0.68945 | C: 0.68676
[LOGITS Ex2 A] Mean Abs: 0.651 | Max: 3.346
[LOSS Ex2] A: 0.48327 | B: 0.48818 | C: 0.47961
** [JOINT LOSS] ** : 1.171619
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001312 | Grad Max: 0.024286
  -> Layer: shared_layers.0.bias | Grad Mean: 0.024407 | Grad Max: 0.117108
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001782 | Grad Max: 0.009847
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019358 | Grad Max: 0.019358
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000216 | Grad Max: 0.016022
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003588 | Grad Max: 0.081661
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.001729
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001316 | Grad Max: 0.006399
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000224
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000371 | Grad Max: 0.001341
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000207
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000177 | Grad Max: 0.000865
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000620 | Grad Max: 0.002684
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006071 | Grad Max: 0.006071
[GRADIENT NORM TOTAL] 0.4623

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5128648 0.4871352] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 0/2048 | B: 56/1992 | C: 175/1873
[LOSS Ex1] A: 0.00000 | B: 0.68799 | C: 0.68488
[LOGITS Ex2 A] Mean Abs: 0.649 | Max: 3.266
[LOSS Ex2] A: 0.47169 | B: 0.47146 | C: 0.47290
** [JOINT LOSS] ** : 0.929640
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001094 | Grad Max: 0.019162
  -> Layer: shared_layers.0.bias | Grad Mean: 0.010405 | Grad Max: 0.062593
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001132 | Grad Max: 0.005233
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013734 | Grad Max: 0.013734
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000130 | Grad Max: 0.019016
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001840 | Grad Max: 0.106280
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.001097
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000433 | Grad Max: 0.003597
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000220
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000112 | Grad Max: 0.000923
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000053 | Grad Max: 0.000485
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000800 | Grad Max: 0.002571
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000533 | Grad Max: 0.000533
[GRADIENT NORM TOTAL] 0.2822

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.105
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027151  0.49728492] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/1616 | B: 70/1786 | C: 172/1876
[LOSS Ex1] A: 0.00000 | B: 0.68945 | C: 0.68462
[LOGITS Ex2 A] Mean Abs: 0.670 | Max: 3.235
[LOSS Ex2] A: 0.46666 | B: 0.46988 | C: 0.46928
** [JOINT LOSS] ** : 0.926626
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001393 | Grad Max: 0.023929
  -> Layer: shared_layers.0.bias | Grad Mean: 0.031077 | Grad Max: 0.159449
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001140 | Grad Max: 0.005123
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015619 | Grad Max: 0.015619
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000301 | Grad Max: 0.012273
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005208 | Grad Max: 0.055818
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.002086
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002362 | Grad Max: 0.008478
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000408
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000719 | Grad Max: 0.002429
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000351 | Grad Max: 0.001432
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001376 | Grad Max: 0.005037
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015139 | Grad Max: 0.015139
[GRADIENT NORM TOTAL] 0.6396

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.187
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51444244 0.48555753] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 71/1977 | C: 179/1869
[LOSS Ex1] A: 0.68675 | B: 0.68914 | C: 0.68450
[LOGITS Ex2 A] Mean Abs: 0.682 | Max: 3.440
[LOSS Ex2] A: 0.47844 | B: 0.49233 | C: 0.47484
** [JOINT LOSS] ** : 1.168669
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.054020
  -> Layer: shared_layers.0.bias | Grad Mean: 0.053721 | Grad Max: 0.305412
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001928 | Grad Max: 0.010456
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019958 | Grad Max: 0.019958
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000501 | Grad Max: 0.018563
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009281 | Grad Max: 0.103803
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.003335
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004247 | Grad Max: 0.015061
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000574
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001248 | Grad Max: 0.003313
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000458
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000597 | Grad Max: 0.001832
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002152 | Grad Max: 0.004679
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023331 | Grad Max: 0.023331
[GRADIENT NORM TOTAL] 1.0954

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.186
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090493 0.4909507] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 1/2047 | B: 62/1986 | C: 165/1883
[LOSS Ex1] A: 0.68687 | B: 0.68927 | C: 0.68581
[LOGITS Ex2 A] Mean Abs: 0.664 | Max: 3.373
[LOSS Ex2] A: 0.46922 | B: 0.48119 | C: 0.49412
** [JOINT LOSS] ** : 1.168828
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001204 | Grad Max: 0.028986
  -> Layer: shared_layers.0.bias | Grad Mean: 0.031490 | Grad Max: 0.170629
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.010832
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022262 | Grad Max: 0.022262
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000301 | Grad Max: 0.010834
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005158 | Grad Max: 0.060355
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.001832
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002252 | Grad Max: 0.008203
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000362
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000675 | Grad Max: 0.002232
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000255
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000324 | Grad Max: 0.001086
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001246 | Grad Max: 0.003679
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014061 | Grad Max: 0.014061
[GRADIENT NORM TOTAL] 0.6440

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.190
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5136271  0.48637292] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 58/1990 | C: 182/1866
[LOSS Ex1] A: 0.68588 | B: 0.68780 | C: 0.68430
[LOGITS Ex2 A] Mean Abs: 0.657 | Max: 3.327
[LOSS Ex2] A: 0.47414 | B: 0.46875 | C: 0.47867
** [JOINT LOSS] ** : 1.159848
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001656 | Grad Max: 0.032548
  -> Layer: shared_layers.0.bias | Grad Mean: 0.022686 | Grad Max: 0.115095
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.010869
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016173 | Grad Max: 0.016173
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000253 | Grad Max: 0.015982
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003576 | Grad Max: 0.085864
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.001574
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001197 | Grad Max: 0.007277
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000242
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000338 | Grad Max: 0.001266
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000219
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000163 | Grad Max: 0.000840
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000732 | Grad Max: 0.003101
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007667 | Grad Max: 0.007667
[GRADIENT NORM TOTAL] 0.4790

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5147493  0.48525068] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 72/1784 | C: 143/1233
[LOSS Ex1] A: 0.68641 | B: 0.68926 | C: 0.68260
[LOGITS Ex2 A] Mean Abs: 0.661 | Max: 3.295
[LOSS Ex2] A: 0.47782 | B: 0.47439 | C: 0.48361
** [JOINT LOSS] ** : 1.164697
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001144 | Grad Max: 0.022514
  -> Layer: shared_layers.0.bias | Grad Mean: 0.014887 | Grad Max: 0.062222
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001728 | Grad Max: 0.009186
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008740 | Grad Max: 0.008740
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000157 | Grad Max: 0.014752
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002628 | Grad Max: 0.079948
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000034 | Grad Max: 0.001680
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000873 | Grad Max: 0.006229
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000325
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000236 | Grad Max: 0.001369
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000196
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000111 | Grad Max: 0.000577
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.002209
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003823 | Grad Max: 0.003823
[GRADIENT NORM TOTAL] 0.3359

[EPOCH SUMMARY] Train Loss: 1.1051

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1435 | Alpha: 0.5500
No improve count: 3/15

############################## EPOCH 9/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.137
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027774 0.4972226] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 0/2048 | B: 74/1974 | C: 202/1846
[LOSS Ex1] A: 0.00000 | B: 0.68896 | C: 0.68374
[LOGITS Ex2 A] Mean Abs: 0.655 | Max: 3.255
[LOSS Ex2] A: 0.46883 | B: 0.49422 | C: 0.46354
** [JOINT LOSS] ** : 0.933097
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001111 | Grad Max: 0.026452
  -> Layer: shared_layers.0.bias | Grad Mean: 0.010104 | Grad Max: 0.066075
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001162 | Grad Max: 0.005383
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013552 | Grad Max: 0.013552
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000127 | Grad Max: 0.011984
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.001860 | Grad Max: 0.068349
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000024 | Grad Max: 0.001347
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000477 | Grad Max: 0.005588
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000220
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000111 | Grad Max: 0.000948
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000165
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000052 | Grad Max: 0.000435
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000491 | Grad Max: 0.001696
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000122 | Grad Max: 0.000122
[GRADIENT NORM TOTAL] 0.2631

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.171
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122877 0.4877123] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 1/2047 | B: 64/1984 | C: 192/1856
[LOSS Ex1] A: 0.68682 | B: 0.68909 | C: 0.68516
[LOGITS Ex2 A] Mean Abs: 0.694 | Max: 3.402
[LOSS Ex2] A: 0.47415 | B: 0.48372 | C: 0.48136
** [JOINT LOSS] ** : 1.166768
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001194 | Grad Max: 0.026759
  -> Layer: shared_layers.0.bias | Grad Mean: 0.021748 | Grad Max: 0.134026
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.010736
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022208 | Grad Max: 0.022208
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000207 | Grad Max: 0.020800
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003633 | Grad Max: 0.115252
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.001743
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001421 | Grad Max: 0.007100
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000284
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000395 | Grad Max: 0.001468
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000207
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000180 | Grad Max: 0.000831
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000506 | Grad Max: 0.002078
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006018 | Grad Max: 0.006018
[GRADIENT NORM TOTAL] 0.4710

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.191
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5111625  0.48883748] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 62/1986 | C: 181/1867
[LOSS Ex1] A: 0.68723 | B: 0.68761 | C: 0.68385
[LOGITS Ex2 A] Mean Abs: 0.679 | Max: 3.656
[LOSS Ex2] A: 0.47307 | B: 0.47886 | C: 0.49758
** [JOINT LOSS] ** : 1.169400
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.060058
  -> Layer: shared_layers.0.bias | Grad Mean: 0.020374 | Grad Max: 0.080368
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001766 | Grad Max: 0.009279
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011004 | Grad Max: 0.011004
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000292 | Grad Max: 0.016634
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004606 | Grad Max: 0.075836
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.002079
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001788 | Grad Max: 0.008455
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000335
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000528 | Grad Max: 0.001813
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000257
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000251 | Grad Max: 0.000930
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001123 | Grad Max: 0.003302
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011497 | Grad Max: 0.011497
[GRADIENT NORM TOTAL] 0.5580

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51477134 0.4852286 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 0/2048 | B: 73/1783 | C: 183/1865
[LOSS Ex1] A: 0.00000 | B: 0.68908 | C: 0.68472
[LOGITS Ex2 A] Mean Abs: 0.692 | Max: 3.349
[LOSS Ex2] A: 0.46862 | B: 0.46493 | C: 0.48307
** [JOINT LOSS] ** : 0.930140
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001559 | Grad Max: 0.037249
  -> Layer: shared_layers.0.bias | Grad Mean: 0.012560 | Grad Max: 0.057478
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001150 | Grad Max: 0.005001
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017278 | Grad Max: 0.017278
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000186 | Grad Max: 0.018937
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002428 | Grad Max: 0.104345
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000030 | Grad Max: 0.001374
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000524 | Grad Max: 0.005585
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000246
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000135 | Grad Max: 0.000846
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000061 | Grad Max: 0.000451
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000497 | Grad Max: 0.002270
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002091 | Grad Max: 0.002091
[GRADIENT NORM TOTAL] 0.3650

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.106
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50404954 0.49595043] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 0/1616 | B: 74/1974 | C: 187/1861
[LOSS Ex1] A: 0.00000 | B: 0.68877 | C: 0.68429
[LOGITS Ex2 A] Mean Abs: 0.724 | Max: 3.327
[LOSS Ex2] A: 0.46005 | B: 0.49160 | C: 0.48888
** [JOINT LOSS] ** : 0.937865
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001127 | Grad Max: 0.023882
  -> Layer: shared_layers.0.bias | Grad Mean: 0.016781 | Grad Max: 0.108778
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001085 | Grad Max: 0.004985
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011972 | Grad Max: 0.011972
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000170 | Grad Max: 0.013183
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002926 | Grad Max: 0.074350
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002027
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001083 | Grad Max: 0.009681
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000268
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000287 | Grad Max: 0.001446
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000238
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000130 | Grad Max: 0.000880
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000367 | Grad Max: 0.001758
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004081 | Grad Max: 0.004081
[GRADIENT NORM TOTAL] 0.3700

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.192
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5138496 0.4861504] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 1/2047 | B: 65/1983 | C: 189/1859
[LOSS Ex1] A: 0.68635 | B: 0.68891 | C: 0.68549
[LOGITS Ex2 A] Mean Abs: 0.718 | Max: 3.439
[LOSS Ex2] A: 0.47115 | B: 0.46979 | C: 0.47567
** [JOINT LOSS] ** : 1.159119
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.047926
  -> Layer: shared_layers.0.bias | Grad Mean: 0.043314 | Grad Max: 0.206790
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.009755
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016387 | Grad Max: 0.016387
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000418 | Grad Max: 0.015573
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007490 | Grad Max: 0.084449
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.003004
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003255 | Grad Max: 0.012216
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000490
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000929 | Grad Max: 0.002720
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000346
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000426 | Grad Max: 0.001447
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001422 | Grad Max: 0.003888
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015619 | Grad Max: 0.015619
[GRADIENT NORM TOTAL] 0.8769

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.191
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084026  0.49159738] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 1/2047 | B: 63/1985 | C: 186/1862
[LOSS Ex1] A: 0.68643 | B: 0.68741 | C: 0.68346
[LOGITS Ex2 A] Mean Abs: 0.694 | Max: 3.552
[LOSS Ex2] A: 0.47876 | B: 0.46879 | C: 0.49046
** [JOINT LOSS] ** : 1.165101
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002340 | Grad Max: 0.059492
  -> Layer: shared_layers.0.bias | Grad Mean: 0.093376 | Grad Max: 0.563302
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002012 | Grad Max: 0.010911
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017346 | Grad Max: 0.017346
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000881 | Grad Max: 0.035841
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016203 | Grad Max: 0.189919
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000249 | Grad Max: 0.004881
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007829 | Grad Max: 0.024360
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000845
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002230 | Grad Max: 0.005557
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000651
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001036 | Grad Max: 0.003085
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003767 | Grad Max: 0.008241
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042664 | Grad Max: 0.042664
[GRADIENT NORM TOTAL] 1.9353

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.195
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5129538  0.48704612] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 75/1781 | C: 200/1848
[LOSS Ex1] A: 0.68546 | B: 0.68889 | C: 0.68272
[LOGITS Ex2 A] Mean Abs: 0.687 | Max: 3.395
[LOSS Ex2] A: 0.46267 | B: 0.46106 | C: 0.47111
** [JOINT LOSS] ** : 1.150635
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001714 | Grad Max: 0.047814
  -> Layer: shared_layers.0.bias | Grad Mean: 0.073308 | Grad Max: 0.432125
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.010772
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014062 | Grad Max: 0.014062
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000657 | Grad Max: 0.024359
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011707 | Grad Max: 0.134496
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000172 | Grad Max: 0.003845
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005429 | Grad Max: 0.016937
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000614
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001530 | Grad Max: 0.004014
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000488
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000706 | Grad Max: 0.002386
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002468 | Grad Max: 0.007572
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029306 | Grad Max: 0.029306
[GRADIENT NORM TOTAL] 1.4633

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.176
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5141448  0.48585525] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 74/1974 | C: 192/1856
[LOSS Ex1] A: 0.68611 | B: 0.68858 | C: 0.68256
[LOGITS Ex2 A] Mean Abs: 0.734 | Max: 3.529
[LOSS Ex2] A: 0.46413 | B: 0.49553 | C: 0.46041
** [JOINT LOSS] ** : 1.159107
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002760 | Grad Max: 0.070414
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087343 | Grad Max: 0.516504
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001718 | Grad Max: 0.008749
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005541 | Grad Max: 0.005541
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000814 | Grad Max: 0.032581
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015247 | Grad Max: 0.179694
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.004707
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006912 | Grad Max: 0.023590
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000741
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001963 | Grad Max: 0.005083
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000546
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000898 | Grad Max: 0.002672
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003036 | Grad Max: 0.006513
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034935 | Grad Max: 0.034935
[GRADIENT NORM TOTAL] 1.7703

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.138
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50181407 0.4981859 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 0/2048 | B: 67/1981 | C: 173/1875
[LOSS Ex1] A: 0.00000 | B: 0.68873 | C: 0.68415
[LOGITS Ex2 A] Mean Abs: 0.702 | Max: 3.346
[LOSS Ex2] A: 0.46875 | B: 0.47718 | C: 0.45603
** [JOINT LOSS] ** : 0.924949
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002791 | Grad Max: 0.074316
  -> Layer: shared_layers.0.bias | Grad Mean: 0.085719 | Grad Max: 0.536801
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001099 | Grad Max: 0.005114
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011510 | Grad Max: 0.011510
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000838 | Grad Max: 0.030582
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015447 | Grad Max: 0.169977
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.004361
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007086 | Grad Max: 0.023471
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000779
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002002 | Grad Max: 0.005150
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000549
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000916 | Grad Max: 0.002567
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003117 | Grad Max: 0.006993
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035958 | Grad Max: 0.035958
[GRADIENT NORM TOTAL] 1.7951

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.175
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51424253 0.48575744] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 66/1982 | C: 178/1870
[LOSS Ex1] A: 0.68641 | B: 0.68723 | C: 0.68551
[LOGITS Ex2 A] Mean Abs: 0.716 | Max: 3.689
[LOSS Ex2] A: 0.45461 | B: 0.46808 | C: 0.46917
** [JOINT LOSS] ** : 1.150337
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001294 | Grad Max: 0.028399
  -> Layer: shared_layers.0.bias | Grad Mean: 0.027702 | Grad Max: 0.146661
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.010998
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.024301 | Grad Max: 0.024301
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000299 | Grad Max: 0.015701
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005096 | Grad Max: 0.085155
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.003171
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002293 | Grad Max: 0.011854
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000387
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000640 | Grad Max: 0.002047
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000294 | Grad Max: 0.001263
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001175 | Grad Max: 0.003974
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012939 | Grad Max: 0.012939
[GRADIENT NORM TOTAL] 0.6098

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.195
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105915  0.48940846] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 76/1780 | C: 209/1839
[LOSS Ex1] A: 0.68691 | B: 0.68873 | C: 0.68378
[LOGITS Ex2 A] Mean Abs: 0.717 | Max: 3.417
[LOSS Ex2] A: 0.47141 | B: 0.46335 | C: 0.47832
** [JOINT LOSS] ** : 1.157498
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002792 | Grad Max: 0.088956
  -> Layer: shared_layers.0.bias | Grad Mean: 0.073397 | Grad Max: 0.408091
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.009992
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018235 | Grad Max: 0.018235
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000733 | Grad Max: 0.026108
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013026 | Grad Max: 0.130656
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000199 | Grad Max: 0.003871
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006127 | Grad Max: 0.020543
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000742
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001740 | Grad Max: 0.004384
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000542
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000793 | Grad Max: 0.002497
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002875 | Grad Max: 0.006277
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032277 | Grad Max: 0.032277
[GRADIENT NORM TOTAL] 1.5273

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5164506  0.48354942] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 0/2048 | B: 77/1971 | C: 184/1864
[LOSS Ex1] A: 0.00000 | B: 0.68842 | C: 0.68585
[LOGITS Ex2 A] Mean Abs: 0.733 | Max: 3.682
[LOSS Ex2] A: 0.45912 | B: 0.48607 | C: 0.46631
** [JOINT LOSS] ** : 0.928588
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001227 | Grad Max: 0.017235
  -> Layer: shared_layers.0.bias | Grad Mean: 0.019766 | Grad Max: 0.112513
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001331 | Grad Max: 0.005052
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022041 | Grad Max: 0.022041
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000201 | Grad Max: 0.013877
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003366 | Grad Max: 0.075387
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.001717
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001086 | Grad Max: 0.007974
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000245
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000275 | Grad Max: 0.001209
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000166
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000123 | Grad Max: 0.000599
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000453 | Grad Max: 0.001686
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004798 | Grad Max: 0.004798
[GRADIENT NORM TOTAL] 0.4242

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.107
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052409 0.4947591] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.004
[MASKS] A(Pass/Fail): 0/1616 | B: 76/1972 | C: 146/1230
[LOSS Ex1] A: 0.00000 | B: 0.68857 | C: 0.68262
[LOGITS Ex2 A] Mean Abs: 0.772 | Max: 3.828
[LOSS Ex2] A: 0.44620 | B: 0.47798 | C: 0.46226
** [JOINT LOSS] ** : 0.919210
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001442 | Grad Max: 0.027572
  -> Layer: shared_layers.0.bias | Grad Mean: 0.053342 | Grad Max: 0.344007
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001076 | Grad Max: 0.005398
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006955 | Grad Max: 0.006955
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000480 | Grad Max: 0.020292
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009116 | Grad Max: 0.115829
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000131 | Grad Max: 0.003272
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004173 | Grad Max: 0.014280
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000586
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001153 | Grad Max: 0.003087
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000369
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000519 | Grad Max: 0.001595
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001561 | Grad Max: 0.004042
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019281 | Grad Max: 0.019281
[GRADIENT NORM TOTAL] 1.1196

[EPOCH SUMMARY] Train Loss: 1.0608

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1271 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.1291 -> New: 1.1271)

############################## EPOCH 10/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.197
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5133361 0.4866639] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 72/1976 | C: 194/1854
[LOSS Ex1] A: 0.68600 | B: 0.68705 | C: 0.68479
[LOGITS Ex2 A] Mean Abs: 0.747 | Max: 3.831
[LOSS Ex2] A: 0.45404 | B: 0.45926 | C: 0.47325
** [JOINT LOSS] ** : 1.148130
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001364 | Grad Max: 0.025969
  -> Layer: shared_layers.0.bias | Grad Mean: 0.013280 | Grad Max: 0.061942
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001941 | Grad Max: 0.010563
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017898 | Grad Max: 0.017898
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000188 | Grad Max: 0.011204
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002706 | Grad Max: 0.060313
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000039 | Grad Max: 0.001714
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000994 | Grad Max: 0.007532
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000199
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000269 | Grad Max: 0.001084
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000185
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000123 | Grad Max: 0.000587
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000653 | Grad Max: 0.002778
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006004 | Grad Max: 0.006004
[GRADIENT NORM TOTAL] 0.3528

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.195
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077994  0.49220058] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 87/1769 | C: 186/1862
[LOSS Ex1] A: 0.68605 | B: 0.68856 | C: 0.68430
[LOGITS Ex2 A] Mean Abs: 0.743 | Max: 3.517
[LOSS Ex2] A: 0.44743 | B: 0.46270 | C: 0.46695
** [JOINT LOSS] ** : 1.145330
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001474 | Grad Max: 0.030208
  -> Layer: shared_layers.0.bias | Grad Mean: 0.032230 | Grad Max: 0.217679
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.011473
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.025315 | Grad Max: 0.025315
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000354 | Grad Max: 0.013347
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006141 | Grad Max: 0.072417
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.003009
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002832 | Grad Max: 0.014176
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000378
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000794 | Grad Max: 0.002349
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000310
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000360 | Grad Max: 0.001302
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001383 | Grad Max: 0.003999
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015233 | Grad Max: 0.015233
[GRADIENT NORM TOTAL] 0.7299

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.199
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5123466  0.48765343] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 89/1959 | C: 196/1852
[LOSS Ex1] A: 0.68509 | B: 0.68824 | C: 0.68388
[LOGITS Ex2 A] Mean Abs: 0.748 | Max: 3.794
[LOSS Ex2] A: 0.44872 | B: 0.49099 | C: 0.44997
** [JOINT LOSS] ** : 1.148964
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001903 | Grad Max: 0.059118
  -> Layer: shared_layers.0.bias | Grad Mean: 0.028637 | Grad Max: 0.115661
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001943 | Grad Max: 0.010471
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014613 | Grad Max: 0.014613
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000318 | Grad Max: 0.021059
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005263 | Grad Max: 0.114654
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.002075
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001963 | Grad Max: 0.008975
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000349
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000530 | Grad Max: 0.001910
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000236 | Grad Max: 0.000945
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000730 | Grad Max: 0.002600
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008213 | Grad Max: 0.008213
[GRADIENT NORM TOTAL] 0.6594

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51357454 0.48642546] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 88/1960 | C: 184/1864
[LOSS Ex1] A: 0.68585 | B: 0.68840 | C: 0.68413
[LOGITS Ex2 A] Mean Abs: 0.726 | Max: 3.530
[LOSS Ex2] A: 0.45699 | B: 0.46714 | C: 0.47432
** [JOINT LOSS] ** : 1.152276
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001219 | Grad Max: 0.020011
  -> Layer: shared_layers.0.bias | Grad Mean: 0.011951 | Grad Max: 0.092182
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001641 | Grad Max: 0.008473
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006319 | Grad Max: 0.006319
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000158 | Grad Max: 0.028019
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002442 | Grad Max: 0.158371
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000026 | Grad Max: 0.001688
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000516 | Grad Max: 0.005438
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000208
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000125 | Grad Max: 0.000972
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000057 | Grad Max: 0.000452
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000486 | Grad Max: 0.001970
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001763 | Grad Max: 0.001763
[GRADIENT NORM TOTAL] 0.3845

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.138
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009043  0.49909562] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 78/1970 | C: 221/1827
[LOSS Ex1] A: 0.00000 | B: 0.68687 | C: 0.68279
[LOGITS Ex2 A] Mean Abs: 0.713 | Max: 3.564
[LOSS Ex2] A: 0.45425 | B: 0.47095 | C: 0.47844
** [JOINT LOSS] ** : 0.924433
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001235 | Grad Max: 0.024147
  -> Layer: shared_layers.0.bias | Grad Mean: 0.010344 | Grad Max: 0.074168
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001157 | Grad Max: 0.005738
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010122 | Grad Max: 0.010122
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000163 | Grad Max: 0.019803
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002313 | Grad Max: 0.101517
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000029 | Grad Max: 0.001556
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000600 | Grad Max: 0.006855
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000246
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000141 | Grad Max: 0.001169
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000141
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000063 | Grad Max: 0.000424
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000391 | Grad Max: 0.001862
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003211 | Grad Max: 0.003211
[GRADIENT NORM TOTAL] 0.3264

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5161198  0.48388022] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 90/1766 | C: 224/1824
[LOSS Ex1] A: 0.68602 | B: 0.68838 | C: 0.68205
[LOGITS Ex2 A] Mean Abs: 0.752 | Max: 3.812
[LOSS Ex2] A: 0.44935 | B: 0.45496 | C: 0.46077
** [JOINT LOSS] ** : 1.140511
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001490 | Grad Max: 0.026572
  -> Layer: shared_layers.0.bias | Grad Mean: 0.030728 | Grad Max: 0.182884
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.010803
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019324 | Grad Max: 0.019324
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000308 | Grad Max: 0.021166
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005375 | Grad Max: 0.116976
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.002503
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002270 | Grad Max: 0.010244
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000334
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000605 | Grad Max: 0.001972
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000265 | Grad Max: 0.001077
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000872 | Grad Max: 0.003017
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009784 | Grad Max: 0.009784
[GRADIENT NORM TOTAL] 0.6627

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.200
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.509994   0.49000606] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 89/1959 | C: 197/1851
[LOSS Ex1] A: 0.68661 | B: 0.68806 | C: 0.68420
[LOGITS Ex2 A] Mean Abs: 0.740 | Max: 3.717
[LOSS Ex2] A: 0.45434 | B: 0.48081 | C: 0.47417
** [JOINT LOSS] ** : 1.156062
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002657 | Grad Max: 0.074216
  -> Layer: shared_layers.0.bias | Grad Mean: 0.076079 | Grad Max: 0.447547
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001826 | Grad Max: 0.009485
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016288 | Grad Max: 0.016288
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000754 | Grad Max: 0.030116
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013695 | Grad Max: 0.159137
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.004444
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006343 | Grad Max: 0.021485
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000662
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001747 | Grad Max: 0.004416
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000536
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000780 | Grad Max: 0.002608
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002789 | Grad Max: 0.006133
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032289 | Grad Max: 0.032289
[GRADIENT NORM TOTAL] 1.5949

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.085
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51812553 0.4818745 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 89/1959 | C: 177/1871
[LOSS Ex1] A: 0.00000 | B: 0.68823 | C: 0.68418
[LOGITS Ex2 A] Mean Abs: 0.745 | Max: 3.669
[LOSS Ex2] A: 0.45280 | B: 0.46137 | C: 0.46218
** [JOINT LOSS] ** : 0.916252
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001208 | Grad Max: 0.018785
  -> Layer: shared_layers.0.bias | Grad Mean: 0.011505 | Grad Max: 0.069626
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001056 | Grad Max: 0.004796
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012734 | Grad Max: 0.012734
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000170 | Grad Max: 0.012804
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002590 | Grad Max: 0.066970
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000035 | Grad Max: 0.001644
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000908 | Grad Max: 0.006190
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000242 | Grad Max: 0.001161
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000193
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000105 | Grad Max: 0.000646
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000546 | Grad Max: 0.002208
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004485 | Grad Max: 0.004485
[GRADIENT NORM TOTAL] 0.3257

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.107
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064614 0.4935386] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/1616 | B: 81/1967 | C: 207/1841
[LOSS Ex1] A: 0.00000 | B: 0.68668 | C: 0.68280
[LOGITS Ex2 A] Mean Abs: 0.823 | Max: 3.585
[LOSS Ex2] A: 0.42787 | B: 0.47280 | C: 0.45522
** [JOINT LOSS] ** : 0.908454
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.051861
  -> Layer: shared_layers.0.bias | Grad Mean: 0.083968 | Grad Max: 0.499453
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001103 | Grad Max: 0.005288
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007069 | Grad Max: 0.007069
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000733 | Grad Max: 0.029058
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014010 | Grad Max: 0.153824
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000209 | Grad Max: 0.005177
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006649 | Grad Max: 0.022915
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000770
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001777 | Grad Max: 0.004704
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000434
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000776 | Grad Max: 0.002254
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002381 | Grad Max: 0.005399
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029207 | Grad Max: 0.029207
[GRADIENT NORM TOTAL] 1.7038

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.202
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5128182  0.48718178] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 91/1765 | C: 190/1858
[LOSS Ex1] A: 0.68565 | B: 0.68820 | C: 0.68346
[LOGITS Ex2 A] Mean Abs: 0.783 | Max: 3.681
[LOSS Ex2] A: 0.44157 | B: 0.46114 | C: 0.45594
** [JOINT LOSS] ** : 1.138656
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001863 | Grad Max: 0.036404
  -> Layer: shared_layers.0.bias | Grad Mean: 0.048264 | Grad Max: 0.313620
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.010262
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016002 | Grad Max: 0.016002
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000462 | Grad Max: 0.029540
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008653 | Grad Max: 0.167199
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.003473
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003917 | Grad Max: 0.015261
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000484
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001036 | Grad Max: 0.002923
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000373
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000455 | Grad Max: 0.001676
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001441 | Grad Max: 0.004378
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017561 | Grad Max: 0.017561
[GRADIENT NORM TOTAL] 1.0339

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.200
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50717443 0.49282557] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 91/1957 | C: 197/1851
[LOSS Ex1] A: 0.68567 | B: 0.68788 | C: 0.68540
[LOGITS Ex2 A] Mean Abs: 0.770 | Max: 3.859
[LOSS Ex2] A: 0.44760 | B: 0.49874 | C: 0.47310
** [JOINT LOSS] ** : 1.159465
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003548 | Grad Max: 0.104352
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156232 | Grad Max: 0.959014
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.011222
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.026301 | Grad Max: 0.026301
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001443 | Grad Max: 0.056073
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027228 | Grad Max: 0.307445
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.008781
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013226 | Grad Max: 0.041405
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001334
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003549 | Grad Max: 0.008412
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000065 | Grad Max: 0.000903
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001546 | Grad Max: 0.004419
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005087 | Grad Max: 0.010448
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060951 | Grad Max: 0.060951
[GRADIENT NORM TOTAL] 3.2571

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.204
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117793  0.48822075] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 92/1956 | C: 232/1816
[LOSS Ex1] A: 0.68473 | B: 0.68806 | C: 0.68270
[LOGITS Ex2 A] Mean Abs: 0.769 | Max: 3.975
[LOSS Ex2] A: 0.44994 | B: 0.48762 | C: 0.49775
** [JOINT LOSS] ** : 1.163594
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003526 | Grad Max: 0.115918
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203352 | Grad Max: 1.268148
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001916 | Grad Max: 0.010120
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010959 | Grad Max: 0.010959
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001784 | Grad Max: 0.063381
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034152 | Grad Max: 0.359569
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000520 | Grad Max: 0.011182
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016812 | Grad Max: 0.056261
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000126 | Grad Max: 0.001573
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004479 | Grad Max: 0.010434
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000081 | Grad Max: 0.001096
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001954 | Grad Max: 0.005482
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006391 | Grad Max: 0.012832
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.077901 | Grad Max: 0.077901
[GRADIENT NORM TOTAL] 4.1959

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.183
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5130732 0.4869268] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 84/1964 | C: 218/1830
[LOSS Ex1] A: 0.68559 | B: 0.68650 | C: 0.68307
[LOGITS Ex2 A] Mean Abs: 0.756 | Max: 3.874
[LOSS Ex2] A: 0.45297 | B: 0.45377 | C: 0.45457
** [JOINT LOSS] ** : 1.138829
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.057328
  -> Layer: shared_layers.0.bias | Grad Mean: 0.065638 | Grad Max: 0.412930
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001777 | Grad Max: 0.008875
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007432 | Grad Max: 0.007432
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000657 | Grad Max: 0.024165
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011974 | Grad Max: 0.129580
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000179 | Grad Max: 0.004838
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005686 | Grad Max: 0.021343
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000675
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001537 | Grad Max: 0.004156
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000401
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000670 | Grad Max: 0.002016
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002291 | Grad Max: 0.006314
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026883 | Grad Max: 0.026883
[GRADIENT NORM TOTAL] 1.4097

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.139
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000784  0.49992162] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 91/1765 | C: 147/1229
[LOSS Ex1] A: 0.00000 | B: 0.68804 | C: 0.68306
[LOGITS Ex2 A] Mean Abs: 0.828 | Max: 3.737
[LOSS Ex2] A: 0.45484 | B: 0.47834 | C: 0.44592
** [JOINT LOSS] ** : 0.916735
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006019 | Grad Max: 0.165601
  -> Layer: shared_layers.0.bias | Grad Mean: 0.209304 | Grad Max: 1.274282
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001185 | Grad Max: 0.005551
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015166 | Grad Max: 0.015166
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.071834
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037146 | Grad Max: 0.345268
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000556 | Grad Max: 0.012017
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017591 | Grad Max: 0.058004
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000134 | Grad Max: 0.001550
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004677 | Grad Max: 0.011137
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000086 | Grad Max: 0.001138
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002022 | Grad Max: 0.005710
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006541 | Grad Max: 0.014409
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.077697 | Grad Max: 0.077697
[GRADIENT NORM TOTAL] 4.3144

[EPOCH SUMMARY] Train Loss: 1.0827

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1723 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 11/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.183
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51775277 0.4822472 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 92/1956 | C: 217/1831
[LOSS Ex1] A: 0.68568 | B: 0.68773 | C: 0.68273
[LOGITS Ex2 A] Mean Abs: 0.899 | Max: 3.696
[LOSS Ex2] A: 0.49196 | B: 0.52176 | C: 0.46937
** [JOINT LOSS] ** : 1.179742
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007432 | Grad Max: 0.220869
  -> Layer: shared_layers.0.bias | Grad Mean: 0.294796 | Grad Max: 1.818786
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001916 | Grad Max: 0.010113
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015760 | Grad Max: 0.015760
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002733 | Grad Max: 0.105764
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051560 | Grad Max: 0.548172
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000772 | Grad Max: 0.016081
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024593 | Grad Max: 0.077051
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000186 | Grad Max: 0.002244
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006525 | Grad Max: 0.015768
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000118 | Grad Max: 0.001464
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002805 | Grad Max: 0.007749
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.009204 | Grad Max: 0.022404
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.108507 | Grad Max: 0.108507
[GRADIENT NORM TOTAL] 6.1022

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.204
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5095347  0.49046525] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 95/1953 | C: 190/1858
[LOSS Ex1] A: 0.68635 | B: 0.68792 | C: 0.68505
[LOGITS Ex2 A] Mean Abs: 0.854 | Max: 3.611
[LOSS Ex2] A: 0.47214 | B: 0.49705 | C: 0.48243
** [JOINT LOSS] ** : 1.170317
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005289 | Grad Max: 0.159863
  -> Layer: shared_layers.0.bias | Grad Mean: 0.243136 | Grad Max: 1.496643
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001765 | Grad Max: 0.009411
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016929 | Grad Max: 0.016929
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.081376
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041880 | Grad Max: 0.443695
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000629 | Grad Max: 0.012926
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020254 | Grad Max: 0.064436
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000151 | Grad Max: 0.001847
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005346 | Grad Max: 0.012582
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000096 | Grad Max: 0.001236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002302 | Grad Max: 0.006359
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.007379 | Grad Max: 0.017281
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.088368 | Grad Max: 0.088368
[GRADIENT NORM TOTAL] 5.0254

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.085
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51944965 0.48055032] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 85/1963 | C: 210/1838
[LOSS Ex1] A: 0.00000 | B: 0.68636 | C: 0.68456
[LOGITS Ex2 A] Mean Abs: 0.785 | Max: 3.828
[LOSS Ex2] A: 0.43845 | B: 0.44822 | C: 0.45292
** [JOINT LOSS] ** : 0.903504
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.025647
  -> Layer: shared_layers.0.bias | Grad Mean: 0.024016 | Grad Max: 0.124068
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001361 | Grad Max: 0.005784
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022667 | Grad Max: 0.022667
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000226 | Grad Max: 0.014092
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004099 | Grad Max: 0.074218
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.001900
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001739 | Grad Max: 0.007864
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000273
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000440 | Grad Max: 0.001520
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000190 | Grad Max: 0.000739
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000603 | Grad Max: 0.002320
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006619 | Grad Max: 0.006619
[GRADIENT NORM TOTAL] 0.5075

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.108
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074163  0.49258372] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/1616 | B: 94/1762 | C: 207/1841
[LOSS Ex1] A: 0.00000 | B: 0.68791 | C: 0.68303
[LOGITS Ex2 A] Mean Abs: 0.815 | Max: 3.806
[LOSS Ex2] A: 0.46002 | B: 0.47158 | C: 0.49686
** [JOINT LOSS] ** : 0.933134
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006456 | Grad Max: 0.174408
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264561 | Grad Max: 1.650712
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001190 | Grad Max: 0.005160
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016610 | Grad Max: 0.016610
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002502 | Grad Max: 0.092515
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046601 | Grad Max: 0.504049
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000698 | Grad Max: 0.015983
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022272 | Grad Max: 0.072204
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000167 | Grad Max: 0.002039
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005898 | Grad Max: 0.013480
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000106 | Grad Max: 0.001384
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002554 | Grad Max: 0.007446
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.008337 | Grad Max: 0.016748
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.100925 | Grad Max: 0.100925
[GRADIENT NORM TOTAL] 5.5279

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.206
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5124308  0.48756918] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 95/1953 | C: 227/1821
[LOSS Ex1] A: 0.68537 | B: 0.68760 | C: 0.68095
[LOGITS Ex2 A] Mean Abs: 0.839 | Max: 3.956
[LOSS Ex2] A: 0.48035 | B: 0.52403 | C: 0.54206
** [JOINT LOSS] ** : 1.200122
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008331 | Grad Max: 0.253883
  -> Layer: shared_layers.0.bias | Grad Mean: 0.345767 | Grad Max: 2.132583
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001871 | Grad Max: 0.009429
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006321 | Grad Max: 0.006321
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003242 | Grad Max: 0.116643
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061228 | Grad Max: 0.645406
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000921 | Grad Max: 0.020877
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029535 | Grad Max: 0.102221
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000219 | Grad Max: 0.002754
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007800 | Grad Max: 0.018314
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000139 | Grad Max: 0.001913
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003383 | Grad Max: 0.009623
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.010841 | Grad Max: 0.023629
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.133612 | Grad Max: 0.133612
[GRADIENT NORM TOTAL] 7.2372

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.203
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067255 0.4932745] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 98/1950 | C: 192/1856
[LOSS Ex1] A: 0.68540 | B: 0.68780 | C: 0.68416
[LOGITS Ex2 A] Mean Abs: 0.814 | Max: 3.907
[LOSS Ex2] A: 0.46384 | B: 0.49383 | C: 0.49285
** [JOINT LOSS] ** : 1.169291
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006528 | Grad Max: 0.192062
  -> Layer: shared_layers.0.bias | Grad Mean: 0.256931 | Grad Max: 1.582020
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001903 | Grad Max: 0.010326
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017335 | Grad Max: 0.017335
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002440 | Grad Max: 0.090673
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046072 | Grad Max: 0.501599
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000695 | Grad Max: 0.017436
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022252 | Grad Max: 0.080264
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000165 | Grad Max: 0.001894
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005879 | Grad Max: 0.013145
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000104 | Grad Max: 0.001353
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002543 | Grad Max: 0.006924
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.008208 | Grad Max: 0.017087
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.100520 | Grad Max: 0.100520
[GRADIENT NORM TOTAL] 5.4050

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.207
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5113705  0.48862952] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 87/1961 | C: 205/1843
[LOSS Ex1] A: 0.68447 | B: 0.68624 | C: 0.68339
[LOGITS Ex2 A] Mean Abs: 0.787 | Max: 3.857
[LOSS Ex2] A: 0.43641 | B: 0.45016 | C: 0.45994
** [JOINT LOSS] ** : 1.133537
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001384 | Grad Max: 0.025742
  -> Layer: shared_layers.0.bias | Grad Mean: 0.034403 | Grad Max: 0.181037
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.010472
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013243 | Grad Max: 0.013243
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000363 | Grad Max: 0.024432
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006143 | Grad Max: 0.129939
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.002748
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002677 | Grad Max: 0.013278
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000345
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000703 | Grad Max: 0.002268
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000209
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000301 | Grad Max: 0.000988
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001029 | Grad Max: 0.003848
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012236 | Grad Max: 0.012236
[GRADIENT NORM TOTAL] 0.7577

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.186
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5127044 0.4872957] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 94/1762 | C: 205/1843
[LOSS Ex1] A: 0.68542 | B: 0.68780 | C: 0.68290
[LOGITS Ex2 A] Mean Abs: 0.888 | Max: 3.773
[LOSS Ex2] A: 0.47007 | B: 0.49505 | C: 0.45873
** [JOINT LOSS] ** : 1.159994
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005568 | Grad Max: 0.147721
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264905 | Grad Max: 1.623303
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001716 | Grad Max: 0.008827
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008575 | Grad Max: 0.008575
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002317 | Grad Max: 0.083412
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044196 | Grad Max: 0.471933
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000651 | Grad Max: 0.013522
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021051 | Grad Max: 0.068584
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000154 | Grad Max: 0.001844
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005557 | Grad Max: 0.012710
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000096 | Grad Max: 0.001166
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002391 | Grad Max: 0.006458
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.007376 | Grad Max: 0.017046
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.091763 | Grad Max: 0.091763
[GRADIENT NORM TOTAL] 5.3618

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.140
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50048566 0.4995143 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 96/1952 | C: 224/1824
[LOSS Ex1] A: 0.00000 | B: 0.68750 | C: 0.68225
[LOGITS Ex2 A] Mean Abs: 0.945 | Max: 3.631
[LOSS Ex2] A: 0.50740 | B: 0.55566 | C: 0.51999
** [JOINT LOSS] ** : 0.984264
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009171 | Grad Max: 0.254047
  -> Layer: shared_layers.0.bias | Grad Mean: 0.373188 | Grad Max: 2.268105
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001136 | Grad Max: 0.005231
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011432 | Grad Max: 0.011432
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003393 | Grad Max: 0.120513
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064149 | Grad Max: 0.624417
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000944 | Grad Max: 0.020404
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030255 | Grad Max: 0.100224
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000222 | Grad Max: 0.002570
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007980 | Grad Max: 0.018242
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000140 | Grad Max: 0.001766
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003435 | Grad Max: 0.009557
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.011157 | Grad Max: 0.028839
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.133817 | Grad Max: 0.133817
[GRADIENT NORM TOTAL] 7.6036

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.186
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5188811  0.48111892] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 100/1948 | C: 188/1860
[LOSS Ex1] A: 0.68544 | B: 0.68770 | C: 0.68641
[LOGITS Ex2 A] Mean Abs: 0.932 | Max: 3.645
[LOSS Ex2] A: 0.49208 | B: 0.53725 | C: 0.51779
** [JOINT LOSS] ** : 1.202221
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008743 | Grad Max: 0.259802
  -> Layer: shared_layers.0.bias | Grad Mean: 0.384300 | Grad Max: 2.333114
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.011263
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.027465 | Grad Max: 0.027465
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003458 | Grad Max: 0.127467
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065471 | Grad Max: 0.711829
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000969 | Grad Max: 0.021459
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031124 | Grad Max: 0.106107
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000228 | Grad Max: 0.002781
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008212 | Grad Max: 0.019112
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000143 | Grad Max: 0.001702
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003538 | Grad Max: 0.009761
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.011196 | Grad Max: 0.027571
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.136575 | Grad Max: 0.136575
[GRADIENT NORM TOTAL] 7.8462

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.208
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5091831  0.49081695] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 3/2045 | B: 88/1960 | C: 221/1827
[LOSS Ex1] A: 0.68618 | B: 0.68613 | C: 0.68311
[LOGITS Ex2 A] Mean Abs: 0.840 | Max: 3.843
[LOSS Ex2] A: 0.44925 | B: 0.47339 | C: 0.45054
** [JOINT LOSS] ** : 1.142866
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003184 | Grad Max: 0.089576
  -> Layer: shared_layers.0.bias | Grad Mean: 0.177262 | Grad Max: 1.098090
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.008875
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009117 | Grad Max: 0.009117
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001499 | Grad Max: 0.060501
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028958 | Grad Max: 0.343603
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000435 | Grad Max: 0.009946
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014187 | Grad Max: 0.049092
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001328
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003725 | Grad Max: 0.008749
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000063 | Grad Max: 0.000796
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001607 | Grad Max: 0.004516
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004788 | Grad Max: 0.010848
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061249 | Grad Max: 0.061249
[GRADIENT NORM TOTAL] 3.6332

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.085
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52042127 0.47957873] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 96/1760 | C: 237/1811
[LOSS Ex1] A: 0.00000 | B: 0.68771 | C: 0.68163
[LOGITS Ex2 A] Mean Abs: 0.789 | Max: 3.809
[LOSS Ex2] A: 0.43687 | B: 0.45023 | C: 0.45190
** [JOINT LOSS] ** : 0.902783
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003730 | Grad Max: 0.123758
  -> Layer: shared_layers.0.bias | Grad Mean: 0.124059 | Grad Max: 0.764436
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001195 | Grad Max: 0.005330
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013041 | Grad Max: 0.013041
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001227 | Grad Max: 0.063243
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022845 | Grad Max: 0.333691
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.007201
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010642 | Grad Max: 0.035396
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001012
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002833 | Grad Max: 0.006658
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000659
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001226 | Grad Max: 0.003372
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003952 | Grad Max: 0.009789
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048031 | Grad Max: 0.048031
[GRADIENT NORM TOTAL] 2.6572

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.109
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50810415 0.49189588] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/1616 | B: 98/1950 | C: 239/1809
[LOSS Ex1] A: 0.00000 | B: 0.68740 | C: 0.68156
[LOGITS Ex2 A] Mean Abs: 0.828 | Max: 3.712
[LOSS Ex2] A: 0.46606 | B: 0.48997 | C: 0.47843
** [JOINT LOSS] ** : 0.934474
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006954 | Grad Max: 0.192340
  -> Layer: shared_layers.0.bias | Grad Mean: 0.263418 | Grad Max: 1.605785
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.005525
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009388 | Grad Max: 0.009388
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002477 | Grad Max: 0.105378
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046345 | Grad Max: 0.540835
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000684 | Grad Max: 0.017244
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021928 | Grad Max: 0.083481
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000160 | Grad Max: 0.002006
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005807 | Grad Max: 0.013522
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000101 | Grad Max: 0.001346
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002525 | Grad Max: 0.007269
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.007987 | Grad Max: 0.015838
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.099319 | Grad Max: 0.099319
[GRADIENT NORM TOTAL] 5.4524

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.208
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5121652  0.48783478] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 100/1948 | C: 138/1238
[LOSS Ex1] A: 0.68519 | B: 0.68761 | C: 0.68273
[LOGITS Ex2 A] Mean Abs: 0.829 | Max: 3.998
[LOSS Ex2] A: 0.44233 | B: 0.46888 | C: 0.48549
** [JOINT LOSS] ** : 1.150741
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005712 | Grad Max: 0.158692
  -> Layer: shared_layers.0.bias | Grad Mean: 0.213038 | Grad Max: 1.283029
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001876 | Grad Max: 0.010145
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014387 | Grad Max: 0.014387
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.086474
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037363 | Grad Max: 0.445395
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000545 | Grad Max: 0.012374
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017526 | Grad Max: 0.061047
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000128 | Grad Max: 0.001594
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004657 | Grad Max: 0.010950
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000080 | Grad Max: 0.001051
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002029 | Grad Max: 0.005876
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006369 | Grad Max: 0.013209
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.080014 | Grad Max: 0.080014
[GRADIENT NORM TOTAL] 4.3904

[EPOCH SUMMARY] Train Loss: 1.0834

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1089 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.1271 -> New: 1.1089)

############################## EPOCH 12/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.206
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50640607 0.493594  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 90/1958 | C: 213/1835
[LOSS Ex1] A: 0.68520 | B: 0.68604 | C: 0.68326
[LOGITS Ex2 A] Mean Abs: 0.802 | Max: 3.707
[LOSS Ex2] A: 0.43108 | B: 0.45379 | C: 0.45459
** [JOINT LOSS] ** : 1.131323
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001815 | Grad Max: 0.042635
  -> Layer: shared_layers.0.bias | Grad Mean: 0.028697 | Grad Max: 0.180230
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.010694
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017788 | Grad Max: 0.017788
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000340 | Grad Max: 0.040797
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005643 | Grad Max: 0.234885
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003379
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002363 | Grad Max: 0.014826
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000379
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000645 | Grad Max: 0.002067
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000266
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000283 | Grad Max: 0.001214
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001089 | Grad Max: 0.003535
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011870 | Grad Max: 0.011870
[GRADIENT NORM TOTAL] 0.7092

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.210
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5110951 0.4889049] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 97/1759 | C: 219/1829
[LOSS Ex1] A: 0.68428 | B: 0.68762 | C: 0.68314
[LOGITS Ex2 A] Mean Abs: 0.863 | Max: 3.666
[LOSS Ex2] A: 0.45505 | B: 0.47324 | C: 0.45412
** [JOINT LOSS] ** : 1.145818
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006307 | Grad Max: 0.177276
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203399 | Grad Max: 1.223315
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002002 | Grad Max: 0.010832
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016064 | Grad Max: 0.016064
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001973 | Grad Max: 0.066316
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036688 | Grad Max: 0.365015
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000537 | Grad Max: 0.012486
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017078 | Grad Max: 0.060265
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000125 | Grad Max: 0.001569
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004529 | Grad Max: 0.010819
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000078 | Grad Max: 0.001081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001968 | Grad Max: 0.005740
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006026 | Grad Max: 0.013751
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.075415 | Grad Max: 0.075415
[GRADIENT NORM TOTAL] 4.2289

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.188
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51245505 0.48754498] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 99/1949 | C: 207/1841
[LOSS Ex1] A: 0.68529 | B: 0.68732 | C: 0.68429
[LOGITS Ex2 A] Mean Abs: 0.894 | Max: 3.482
[LOSS Ex2] A: 0.46663 | B: 0.52036 | C: 0.48151
** [JOINT LOSS] ** : 1.175133
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006488 | Grad Max: 0.170864
  -> Layer: shared_layers.0.bias | Grad Mean: 0.275940 | Grad Max: 1.695373
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001675 | Grad Max: 0.008605
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009216 | Grad Max: 0.009216
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002532 | Grad Max: 0.093223
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048040 | Grad Max: 0.522449
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000701 | Grad Max: 0.015891
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022651 | Grad Max: 0.079529
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000162 | Grad Max: 0.002078
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005988 | Grad Max: 0.014215
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000101 | Grad Max: 0.001234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002601 | Grad Max: 0.007138
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.007884 | Grad Max: 0.018929
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.099869 | Grad Max: 0.099869
[GRADIENT NORM TOTAL] 5.6987

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.140
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50089246 0.49910757] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 101/1947 | C: 220/1828
[LOSS Ex1] A: 0.00000 | B: 0.68753 | C: 0.68199
[LOGITS Ex2 A] Mean Abs: 0.825 | Max: 3.778
[LOSS Ex2] A: 0.45229 | B: 0.47766 | C: 0.46765
** [JOINT LOSS] ** : 0.922374
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005159 | Grad Max: 0.143156
  -> Layer: shared_layers.0.bias | Grad Mean: 0.187044 | Grad Max: 1.126397
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001101 | Grad Max: 0.005395
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006424 | Grad Max: 0.006424
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001757 | Grad Max: 0.067053
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033002 | Grad Max: 0.367156
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000482 | Grad Max: 0.010852
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015460 | Grad Max: 0.052853
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004095 | Grad Max: 0.009503
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000070 | Grad Max: 0.000972
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001781 | Grad Max: 0.005360
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005362 | Grad Max: 0.012571
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.067999 | Grad Max: 0.067999
[GRADIENT NORM TOTAL] 3.8925

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.188
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51972604 0.48027393] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 90/1958 | C: 199/1849
[LOSS Ex1] A: 0.68526 | B: 0.68595 | C: 0.68345
[LOGITS Ex2 A] Mean Abs: 0.783 | Max: 3.762
[LOSS Ex2] A: 0.42220 | B: 0.44949 | C: 0.45103
** [JOINT LOSS] ** : 1.125791
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001523 | Grad Max: 0.027018
  -> Layer: shared_layers.0.bias | Grad Mean: 0.011775 | Grad Max: 0.083810
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.010420
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019576 | Grad Max: 0.019576
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000184 | Grad Max: 0.011772
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002606 | Grad Max: 0.065538
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000034 | Grad Max: 0.001977
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000798 | Grad Max: 0.007720
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000188
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000193 | Grad Max: 0.001102
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000532
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000732 | Grad Max: 0.002138
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002184 | Grad Max: 0.002184
[GRADIENT NORM TOTAL] 0.3434

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.210
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5089323 0.4910677] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 97/1759 | C: 197/1851
[LOSS Ex1] A: 0.68604 | B: 0.68754 | C: 0.68321
[LOGITS Ex2 A] Mean Abs: 0.785 | Max: 3.927
[LOSS Ex2] A: 0.44756 | B: 0.45662 | C: 0.47459
** [JOINT LOSS] ** : 1.145187
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005047 | Grad Max: 0.144888
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203139 | Grad Max: 1.239459
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001775 | Grad Max: 0.009303
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014286 | Grad Max: 0.014286
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001894 | Grad Max: 0.065689
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035384 | Grad Max: 0.369278
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000518 | Grad Max: 0.012840
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016653 | Grad Max: 0.064023
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000119 | Grad Max: 0.001537
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004418 | Grad Max: 0.010564
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000074 | Grad Max: 0.000971
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001928 | Grad Max: 0.005480
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005866 | Grad Max: 0.012857
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.075319 | Grad Max: 0.075319
[GRADIENT NORM TOTAL] 4.1927

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.085
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5212006 0.4787994] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 100/1948 | C: 219/1829
[LOSS Ex1] A: 0.00000 | B: 0.68723 | C: 0.68212
[LOGITS Ex2 A] Mean Abs: 0.778 | Max: 3.978
[LOSS Ex2] A: 0.46321 | B: 0.50012 | C: 0.47475
** [JOINT LOSS] ** : 0.935812
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005955 | Grad Max: 0.167478
  -> Layer: shared_layers.0.bias | Grad Mean: 0.257342 | Grad Max: 1.584616
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001145 | Grad Max: 0.005239
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012725 | Grad Max: 0.012725
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002343 | Grad Max: 0.082511
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044214 | Grad Max: 0.442254
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000647 | Grad Max: 0.014175
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020986 | Grad Max: 0.069687
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000149 | Grad Max: 0.001926
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005568 | Grad Max: 0.013106
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000093 | Grad Max: 0.001206
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002439 | Grad Max: 0.006842
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.007318 | Grad Max: 0.014900
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.095357 | Grad Max: 0.095357
[GRADIENT NORM TOTAL] 5.2928

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.109
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50868213 0.4913179 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/1616 | B: 101/1947 | C: 242/1806
[LOSS Ex1] A: 0.00000 | B: 0.68745 | C: 0.68345
[LOGITS Ex2 A] Mean Abs: 0.790 | Max: 3.806
[LOSS Ex2] A: 0.43150 | B: 0.46573 | C: 0.45664
** [JOINT LOSS] ** : 0.908255
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003649 | Grad Max: 0.104599
  -> Layer: shared_layers.0.bias | Grad Mean: 0.159580 | Grad Max: 0.997957
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001280 | Grad Max: 0.005046
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019372 | Grad Max: 0.019372
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001450 | Grad Max: 0.055166
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027191 | Grad Max: 0.286822
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.009593
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012985 | Grad Max: 0.047538
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003448 | Grad Max: 0.008302
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000740
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001501 | Grad Max: 0.004252
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004548 | Grad Max: 0.010285
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058409 | Grad Max: 0.058409
[GRADIENT NORM TOTAL] 3.3068

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.211
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5119357  0.48806432] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 91/1957 | C: 227/1821
[LOSS Ex1] A: 0.68502 | B: 0.68585 | C: 0.68210
[LOGITS Ex2 A] Mean Abs: 0.819 | Max: 3.608
[LOSS Ex2] A: 0.42265 | B: 0.44896 | C: 0.45588
** [JOINT LOSS] ** : 1.126823
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002380 | Grad Max: 0.071424
  -> Layer: shared_layers.0.bias | Grad Mean: 0.064190 | Grad Max: 0.371426
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001946 | Grad Max: 0.010179
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012553 | Grad Max: 0.012553
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000617 | Grad Max: 0.031395
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011624 | Grad Max: 0.157595
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.004217
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005244 | Grad Max: 0.019545
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000551
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001374 | Grad Max: 0.003705
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000376
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000602 | Grad Max: 0.001882
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001635 | Grad Max: 0.004727
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021806 | Grad Max: 0.021806
[GRADIENT NORM TOTAL] 1.3540

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.208
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50611496 0.4938851 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 1/2047 | B: 98/1758 | C: 216/1832
[LOSS Ex1] A: 0.68502 | B: 0.68745 | C: 0.68325
[LOGITS Ex2 A] Mean Abs: 0.820 | Max: 3.687
[LOSS Ex2] A: 0.43436 | B: 0.45405 | C: 0.45473
** [JOINT LOSS] ** : 1.132953
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003532 | Grad Max: 0.097544
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126056 | Grad Max: 0.779230
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.010856
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021792 | Grad Max: 0.021792
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001150 | Grad Max: 0.052640
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021886 | Grad Max: 0.263083
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000308 | Grad Max: 0.008051
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009967 | Grad Max: 0.037687
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000920
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002635 | Grad Max: 0.006442
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000593
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001152 | Grad Max: 0.003251
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003373 | Grad Max: 0.007502
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044238 | Grad Max: 0.044238
[GRADIENT NORM TOTAL] 2.5888

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.212
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108243  0.48917568] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 102/1946 | C: 214/1834
[LOSS Ex1] A: 0.68410 | B: 0.68715 | C: 0.68267
[LOGITS Ex2 A] Mean Abs: 0.807 | Max: 3.486
[LOSS Ex2] A: 0.42864 | B: 0.47492 | C: 0.45249
** [JOINT LOSS] ** : 1.136655
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.065982
  -> Layer: shared_layers.0.bias | Grad Mean: 0.039330 | Grad Max: 0.237102
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.010601
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013264 | Grad Max: 0.013264
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000394 | Grad Max: 0.036099
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007185 | Grad Max: 0.201627
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000099 | Grad Max: 0.003102
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003056 | Grad Max: 0.013530
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000396
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000805 | Grad Max: 0.002479
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000243
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000346 | Grad Max: 0.001325
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000878 | Grad Max: 0.003252
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011850 | Grad Max: 0.011850
[GRADIENT NORM TOTAL] 0.8785

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.189
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122033 0.4877967] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 2/2046 | B: 101/1947 | C: 224/1824
[LOSS Ex1] A: 0.68515 | B: 0.68736 | C: 0.68211
[LOGITS Ex2 A] Mean Abs: 0.772 | Max: 3.837
[LOSS Ex2] A: 0.45123 | B: 0.46160 | C: 0.46935
** [JOINT LOSS] ** : 1.145597
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004480 | Grad Max: 0.117355
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139012 | Grad Max: 0.827945
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.008747
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007271 | Grad Max: 0.007271
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001357 | Grad Max: 0.049116
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025145 | Grad Max: 0.259510
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.008793
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011632 | Grad Max: 0.045624
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001169
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003097 | Grad Max: 0.007766
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000627
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001354 | Grad Max: 0.003759
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004204 | Grad Max: 0.009471
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053764 | Grad Max: 0.053764
[GRADIENT NORM TOTAL] 2.8937

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.141
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013458  0.49865425] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 93/1955 | C: 210/1838
[LOSS Ex1] A: 0.00000 | B: 0.68575 | C: 0.68358
[LOGITS Ex2 A] Mean Abs: 0.758 | Max: 3.855
[LOSS Ex2] A: 0.44594 | B: 0.44928 | C: 0.47515
** [JOINT LOSS] ** : 0.913236
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004655 | Grad Max: 0.130096
  -> Layer: shared_layers.0.bias | Grad Mean: 0.159138 | Grad Max: 0.974874
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001135 | Grad Max: 0.005306
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011483 | Grad Max: 0.011483
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001519 | Grad Max: 0.056309
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028423 | Grad Max: 0.296868
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000405 | Grad Max: 0.010114
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013114 | Grad Max: 0.044648
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001202
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003488 | Grad Max: 0.008137
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000058 | Grad Max: 0.000724
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001524 | Grad Max: 0.004058
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004639 | Grad Max: 0.010389
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060052 | Grad Max: 0.060052
[GRADIENT NORM TOTAL] 3.2975

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.190
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5206863  0.47931367] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 101/1755 | C: 149/1227
[LOSS Ex1] A: 0.68504 | B: 0.68735 | C: 0.68117
[LOGITS Ex2 A] Mean Abs: 0.812 | Max: 3.779
[LOSS Ex2] A: 0.42423 | B: 0.44758 | C: 0.46680
** [JOINT LOSS] ** : 1.130723
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.063930
  -> Layer: shared_layers.0.bias | Grad Mean: 0.045409 | Grad Max: 0.233599
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001939 | Grad Max: 0.010126
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013499 | Grad Max: 0.013499
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000476 | Grad Max: 0.020711
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008549 | Grad Max: 0.107756
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.002940
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003695 | Grad Max: 0.014620
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000394
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000982 | Grad Max: 0.002733
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000294
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000431 | Grad Max: 0.001618
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001415 | Grad Max: 0.004352
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017559 | Grad Max: 0.017559
[GRADIENT NORM TOTAL] 0.9970

[EPOCH SUMMARY] Train Loss: 1.0768

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1278 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 13/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.213
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50866747 0.49133256] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 107/1941 | C: 224/1824
[LOSS Ex1] A: 0.68587 | B: 0.68704 | C: 0.68206
[LOGITS Ex2 A] Mean Abs: 0.851 | Max: 3.649
[LOSS Ex2] A: 0.45136 | B: 0.50492 | C: 0.44847
** [JOINT LOSS] ** : 1.153240
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003661 | Grad Max: 0.101209
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198608 | Grad Max: 1.235167
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001828 | Grad Max: 0.009640
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014396 | Grad Max: 0.014396
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001701 | Grad Max: 0.069071
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033025 | Grad Max: 0.401876
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.011006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015990 | Grad Max: 0.056619
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004210 | Grad Max: 0.010112
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000067 | Grad Max: 0.000889
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001829 | Grad Max: 0.004942
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005259 | Grad Max: 0.012141
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.069890 | Grad Max: 0.069890
[GRADIENT NORM TOTAL] 4.1065

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52209735 0.47790262] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.005
[MASKS] A(Pass/Fail): 0/2048 | B: 105/1943 | C: 215/1833
[LOSS Ex1] A: 0.00000 | B: 0.68726 | C: 0.68221
[LOGITS Ex2 A] Mean Abs: 0.902 | Max: 3.745
[LOSS Ex2] A: 0.45396 | B: 0.49609 | C: 0.47111
** [JOINT LOSS] ** : 0.930208
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005543 | Grad Max: 0.151161
  -> Layer: shared_layers.0.bias | Grad Mean: 0.250835 | Grad Max: 1.557162
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001049 | Grad Max: 0.005431
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007039 | Grad Max: 0.007039
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.093746
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042921 | Grad Max: 0.492413
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000620 | Grad Max: 0.013350
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020214 | Grad Max: 0.070463
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000140 | Grad Max: 0.001694
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005340 | Grad Max: 0.012424
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000086 | Grad Max: 0.001099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002322 | Grad Max: 0.006453
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006738 | Grad Max: 0.016167
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.088769 | Grad Max: 0.088769
[GRADIENT NORM TOTAL] 5.2238

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.110
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50935894 0.49064106] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 100/1948 | C: 223/1825
[LOSS Ex1] A: 0.00000 | B: 0.68565 | C: 0.68385
[LOGITS Ex2 A] Mean Abs: 0.903 | Max: 3.516
[LOSS Ex2] A: 0.42420 | B: 0.47981 | C: 0.46850
** [JOINT LOSS] ** : 0.914000
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004683 | Grad Max: 0.123911
  -> Layer: shared_layers.0.bias | Grad Mean: 0.219863 | Grad Max: 1.337774
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001400 | Grad Max: 0.005949
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022268 | Grad Max: 0.022268
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001935 | Grad Max: 0.078084
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037036 | Grad Max: 0.441926
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000538 | Grad Max: 0.010998
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017530 | Grad Max: 0.056661
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000121 | Grad Max: 0.001537
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004610 | Grad Max: 0.010907
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000074 | Grad Max: 0.000936
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002002 | Grad Max: 0.005594
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005767 | Grad Max: 0.013277
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.076761 | Grad Max: 0.076761
[GRADIENT NORM TOTAL] 4.5088

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.214
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.511722 0.488278] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 105/1751 | C: 200/1848
[LOSS Ex1] A: 0.68482 | B: 0.68726 | C: 0.68344
[LOGITS Ex2 A] Mean Abs: 0.820 | Max: 3.692
[LOSS Ex2] A: 0.42372 | B: 0.43989 | C: 0.47711
** [JOINT LOSS] ** : 1.132078
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001230 | Grad Max: 0.018586
  -> Layer: shared_layers.0.bias | Grad Mean: 0.036908 | Grad Max: 0.216045
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001880 | Grad Max: 0.010088
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017026 | Grad Max: 0.017026
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000342 | Grad Max: 0.013227
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006210 | Grad Max: 0.072556
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.002868
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002752 | Grad Max: 0.013271
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000338
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000714 | Grad Max: 0.002348
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000230
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000311 | Grad Max: 0.001197
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000804 | Grad Max: 0.002958
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011242 | Grad Max: 0.011242
[GRADIENT NORM TOTAL] 0.7704

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.210
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50578946 0.49421057] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 112/1936 | C: 220/1828
[LOSS Ex1] A: 0.68479 | B: 0.68695 | C: 0.68366
[LOGITS Ex2 A] Mean Abs: 0.813 | Max: 3.877
[LOSS Ex2] A: 0.44784 | B: 0.48633 | C: 0.48231
** [JOINT LOSS] ** : 1.157295
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004861 | Grad Max: 0.139294
  -> Layer: shared_layers.0.bias | Grad Mean: 0.221011 | Grad Max: 1.358294
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.011085
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022671 | Grad Max: 0.022671
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.074608
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037663 | Grad Max: 0.405886
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000540 | Grad Max: 0.014316
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017624 | Grad Max: 0.066064
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000121 | Grad Max: 0.001574
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004654 | Grad Max: 0.010870
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000075 | Grad Max: 0.000988
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002037 | Grad Max: 0.005871
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005996 | Grad Max: 0.012430
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.079544 | Grad Max: 0.079544
[GRADIENT NORM TOTAL] 4.5278

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.215
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105478 0.4894522] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 113/1935 | C: 218/1830
[LOSS Ex1] A: 0.68388 | B: 0.68716 | C: 0.68358
[LOGITS Ex2 A] Mean Abs: 0.822 | Max: 4.176
[LOSS Ex2] A: 0.46598 | B: 0.50427 | C: 0.48672
** [JOINT LOSS] ** : 1.170530
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006863 | Grad Max: 0.190868
  -> Layer: shared_layers.0.bias | Grad Mean: 0.313434 | Grad Max: 1.939338
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.010041
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014809 | Grad Max: 0.014809
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002782 | Grad Max: 0.106735
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053059 | Grad Max: 0.559461
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000767 | Grad Max: 0.018961
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025138 | Grad Max: 0.089918
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000172 | Grad Max: 0.002224
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006626 | Grad Max: 0.015590
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000106 | Grad Max: 0.001315
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002903 | Grad Max: 0.008089
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.008459 | Grad Max: 0.017960
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.114752 | Grad Max: 0.114752
[GRADIENT NORM TOTAL] 6.4081

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.192
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5119585  0.48804152] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 2/2046 | B: 102/1946 | C: 227/1821
[LOSS Ex1] A: 0.68499 | B: 0.68554 | C: 0.68381
[LOGITS Ex2 A] Mean Abs: 0.779 | Max: 3.875
[LOSS Ex2] A: 0.46735 | B: 0.47923 | C: 0.46638
** [JOINT LOSS] ** : 1.155766
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006310 | Grad Max: 0.181320
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269654 | Grad Max: 1.636061
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001841 | Grad Max: 0.009389
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014638 | Grad Max: 0.014638
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002437 | Grad Max: 0.086809
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046006 | Grad Max: 0.487098
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000662 | Grad Max: 0.015199
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021613 | Grad Max: 0.078028
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000149 | Grad Max: 0.001827
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005698 | Grad Max: 0.013202
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000092 | Grad Max: 0.001157
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002488 | Grad Max: 0.006898
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.007320 | Grad Max: 0.014920
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.097530 | Grad Max: 0.097530
[GRADIENT NORM TOTAL] 5.5030

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.142
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5018524  0.49814767] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 107/1749 | C: 217/1831
[LOSS Ex1] A: 0.00000 | B: 0.68716 | C: 0.68164
[LOGITS Ex2 A] Mean Abs: 0.737 | Max: 3.724
[LOSS Ex2] A: 0.42436 | B: 0.44585 | C: 0.45967
** [JOINT LOSS] ** : 0.899563
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002730 | Grad Max: 0.071032
  -> Layer: shared_layers.0.bias | Grad Mean: 0.130315 | Grad Max: 0.816033
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001160 | Grad Max: 0.005277
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010099 | Grad Max: 0.010099
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.049153
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021901 | Grad Max: 0.263830
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.007655
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010524 | Grad Max: 0.037565
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000930
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002772 | Grad Max: 0.006862
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000556
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001201 | Grad Max: 0.003410
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003558 | Grad Max: 0.008716
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046569 | Grad Max: 0.046569
[GRADIENT NORM TOTAL] 2.6963

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.193
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5216402  0.47835988] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 114/1934 | C: 227/1821
[LOSS Ex1] A: 0.68484 | B: 0.68685 | C: 0.68207
[LOGITS Ex2 A] Mean Abs: 0.836 | Max: 3.727
[LOSS Ex2] A: 0.43711 | B: 0.48182 | C: 0.46025
** [JOINT LOSS] ** : 1.144317
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005706 | Grad Max: 0.180257
  -> Layer: shared_layers.0.bias | Grad Mean: 0.161535 | Grad Max: 0.935794
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.010117
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013690 | Grad Max: 0.013690
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001584 | Grad Max: 0.059072
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029362 | Grad Max: 0.296301
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.009199
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013141 | Grad Max: 0.049463
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001246
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003478 | Grad Max: 0.008547
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000768
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001512 | Grad Max: 0.004492
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004392 | Grad Max: 0.009275
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057471 | Grad Max: 0.057471
[GRADIENT NORM TOTAL] 3.3693

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.216
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083936  0.49160644] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 115/1933 | C: 231/1817
[LOSS Ex1] A: 0.68572 | B: 0.68708 | C: 0.68073
[LOGITS Ex2 A] Mean Abs: 0.888 | Max: 3.399
[LOSS Ex2] A: 0.46682 | B: 0.48727 | C: 0.46756
** [JOINT LOSS] ** : 1.158392
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006869 | Grad Max: 0.206397
  -> Layer: shared_layers.0.bias | Grad Mean: 0.283724 | Grad Max: 1.706938
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001821 | Grad Max: 0.009194
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009682 | Grad Max: 0.009682
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002570 | Grad Max: 0.098984
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049026 | Grad Max: 0.559611
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000689 | Grad Max: 0.014997
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022494 | Grad Max: 0.078575
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000154 | Grad Max: 0.001976
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005924 | Grad Max: 0.014101
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000095 | Grad Max: 0.001183
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002576 | Grad Max: 0.006993
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.007432 | Grad Max: 0.017181
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.098482 | Grad Max: 0.098482
[GRADIENT NORM TOTAL] 5.8148

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5229239 0.4770761] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 109/1939 | C: 220/1828
[LOSS Ex1] A: 0.00000 | B: 0.68545 | C: 0.68153
[LOGITS Ex2 A] Mean Abs: 0.889 | Max: 3.766
[LOSS Ex2] A: 0.45225 | B: 0.47664 | C: 0.44827
** [JOINT LOSS] ** : 0.914716
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006391 | Grad Max: 0.172901
  -> Layer: shared_layers.0.bias | Grad Mean: 0.243179 | Grad Max: 1.462826
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001145 | Grad Max: 0.005549
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003044 | Grad Max: 0.003044
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.085078
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041697 | Grad Max: 0.466138
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000579 | Grad Max: 0.012664
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018867 | Grad Max: 0.064823
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001694
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004989 | Grad Max: 0.012402
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000079 | Grad Max: 0.000982
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002171 | Grad Max: 0.005962
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006256 | Grad Max: 0.014290
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.082915 | Grad Max: 0.082915
[GRADIENT NORM TOTAL] 4.9423

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.110
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5099776 0.4900224] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 110/1746 | C: 219/1829
[LOSS Ex1] A: 0.00000 | B: 0.68708 | C: 0.68260
[LOGITS Ex2 A] Mean Abs: 0.852 | Max: 3.680
[LOSS Ex2] A: 0.41525 | B: 0.43874 | C: 0.43307
** [JOINT LOSS] ** : 0.885577
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003399 | Grad Max: 0.088965
  -> Layer: shared_layers.0.bias | Grad Mean: 0.104238 | Grad Max: 0.633965
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001122 | Grad Max: 0.005241
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013392 | Grad Max: 0.013392
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000975 | Grad Max: 0.042724
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018482 | Grad Max: 0.219247
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000259 | Grad Max: 0.006552
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008410 | Grad Max: 0.031115
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000764
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002213 | Grad Max: 0.005451
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000545
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000966 | Grad Max: 0.003070
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002588 | Grad Max: 0.006325
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036158 | Grad Max: 0.036158
[GRADIENT NORM TOTAL] 2.1506

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.216
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51150197 0.48849797] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 117/1931 | C: 214/1834
[LOSS Ex1] A: 0.68466 | B: 0.68677 | C: 0.68260
[LOGITS Ex2 A] Mean Abs: 0.803 | Max: 3.830
[LOSS Ex2] A: 0.42544 | B: 0.47536 | C: 0.46184
** [JOINT LOSS] ** : 1.138886
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003784 | Grad Max: 0.095331
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149209 | Grad Max: 0.897146
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001878 | Grad Max: 0.009959
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014470 | Grad Max: 0.014470
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001378 | Grad Max: 0.055358
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025743 | Grad Max: 0.306408
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.008053
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011958 | Grad Max: 0.039609
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.001108
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003163 | Grad Max: 0.007804
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000695
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001384 | Grad Max: 0.003995
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004096 | Grad Max: 0.009501
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.054172 | Grad Max: 0.054172
[GRADIENT NORM TOTAL] 3.0632

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.212
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50549144 0.49450853] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 118/1930 | C: 144/1232
[LOSS Ex1] A: 0.68462 | B: 0.68700 | C: 0.68231
[LOGITS Ex2 A] Mean Abs: 0.804 | Max: 3.758
[LOSS Ex2] A: 0.44342 | B: 0.47390 | C: 0.46535
** [JOINT LOSS] ** : 1.145531
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005507 | Grad Max: 0.152996
  -> Layer: shared_layers.0.bias | Grad Mean: 0.247635 | Grad Max: 1.501233
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.010114
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015511 | Grad Max: 0.015511
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.084958
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041942 | Grad Max: 0.477203
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000599 | Grad Max: 0.014287
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019690 | Grad Max: 0.064640
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000133 | Grad Max: 0.001805
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005199 | Grad Max: 0.012320
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000081 | Grad Max: 0.001054
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002272 | Grad Max: 0.006416
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006443 | Grad Max: 0.013997
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.088202 | Grad Max: 0.088202
[GRADIENT NORM TOTAL] 5.0311

[EPOCH SUMMARY] Train Loss: 1.0643

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1181 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 14/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.217
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51031023 0.48968974] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 110/1938 | C: 205/1843
[LOSS Ex1] A: 0.68372 | B: 0.68537 | C: 0.68327
[LOGITS Ex2 A] Mean Abs: 0.796 | Max: 3.507
[LOSS Ex2] A: 0.43886 | B: 0.44437 | C: 0.46836
** [JOINT LOSS] ** : 1.134649
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004506 | Grad Max: 0.109250
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208138 | Grad Max: 1.270838
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002026 | Grad Max: 0.010681
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014983 | Grad Max: 0.014983
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001832 | Grad Max: 0.069294
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034732 | Grad Max: 0.383120
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000498 | Grad Max: 0.011467
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016395 | Grad Max: 0.056857
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000110 | Grad Max: 0.001470
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004323 | Grad Max: 0.010442
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000068 | Grad Max: 0.000901
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001887 | Grad Max: 0.005301
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005436 | Grad Max: 0.011317
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.073409 | Grad Max: 0.073409
[GRADIENT NORM TOTAL] 4.2067

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.194
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117516 0.4882484] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 2/2046 | B: 113/1743 | C: 218/1830
[LOSS Ex1] A: 0.68488 | B: 0.68700 | C: 0.68421
[LOGITS Ex2 A] Mean Abs: 0.794 | Max: 3.673
[LOSS Ex2] A: 0.42349 | B: 0.44672 | C: 0.44593
** [JOINT LOSS] ** : 1.124075
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002292 | Grad Max: 0.060297
  -> Layer: shared_layers.0.bias | Grad Mean: 0.063031 | Grad Max: 0.359091
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001791 | Grad Max: 0.009413
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015835 | Grad Max: 0.015835
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000609 | Grad Max: 0.023092
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011034 | Grad Max: 0.126433
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000159 | Grad Max: 0.003934
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005107 | Grad Max: 0.017536
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000544
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001361 | Grad Max: 0.003800
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000328
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000587 | Grad Max: 0.001895
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001792 | Grad Max: 0.005263
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022736 | Grad Max: 0.022736
[GRADIENT NORM TOTAL] 1.3083

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.142
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022183 0.4977817] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 122/1926 | C: 216/1832
[LOSS Ex1] A: 0.00000 | B: 0.68669 | C: 0.68220
[LOGITS Ex2 A] Mean Abs: 0.846 | Max: 3.569
[LOSS Ex2] A: 0.43383 | B: 0.48889 | C: 0.45948
** [JOINT LOSS] ** : 0.917033
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005025 | Grad Max: 0.134538
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201530 | Grad Max: 1.220197
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001066 | Grad Max: 0.005151
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008190 | Grad Max: 0.008190
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001790 | Grad Max: 0.070422
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033742 | Grad Max: 0.383758
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000469 | Grad Max: 0.010581
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015325 | Grad Max: 0.053599
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001516
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004048 | Grad Max: 0.010061
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000064 | Grad Max: 0.000849
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001763 | Grad Max: 0.004997
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004978 | Grad Max: 0.011714
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.066946 | Grad Max: 0.066946
[GRADIENT NORM TOTAL] 4.0725

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.195
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52235824 0.47764176] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 122/1926 | C: 192/1856
[LOSS Ex1] A: 0.68469 | B: 0.68692 | C: 0.68382
[LOGITS Ex2 A] Mean Abs: 0.931 | Max: 3.794
[LOSS Ex2] A: 0.46877 | B: 0.51097 | C: 0.46460
** [JOINT LOSS] ** : 1.166592
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007681 | Grad Max: 0.227402
  -> Layer: shared_layers.0.bias | Grad Mean: 0.341362 | Grad Max: 2.062739
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001961 | Grad Max: 0.010347
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019807 | Grad Max: 0.019807
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003007 | Grad Max: 0.112736
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056895 | Grad Max: 0.628586
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000798 | Grad Max: 0.017852
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026126 | Grad Max: 0.089403
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000175 | Grad Max: 0.002205
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006883 | Grad Max: 0.016716
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000108 | Grad Max: 0.001408
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003002 | Grad Max: 0.008336
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.008699 | Grad Max: 0.020364
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.115615 | Grad Max: 0.115615
[GRADIENT NORM TOTAL] 6.8893

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.218
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081861  0.49181384] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 114/1934 | C: 229/1819
[LOSS Ex1] A: 0.68560 | B: 0.68529 | C: 0.67987
[LOGITS Ex2 A] Mean Abs: 0.918 | Max: 3.730
[LOSS Ex2] A: 0.46799 | B: 0.49862 | C: 0.46360
** [JOINT LOSS] ** : 1.160321
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006540 | Grad Max: 0.185130
  -> Layer: shared_layers.0.bias | Grad Mean: 0.322500 | Grad Max: 1.957633
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.009015
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004172 | Grad Max: 0.004172
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002775 | Grad Max: 0.105910
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053222 | Grad Max: 0.600234
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000748 | Grad Max: 0.016179
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024661 | Grad Max: 0.081699
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000164 | Grad Max: 0.002151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006491 | Grad Max: 0.015790
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000101 | Grad Max: 0.001252
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002831 | Grad Max: 0.007824
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.008014 | Grad Max: 0.019399
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.107948 | Grad Max: 0.107948
[GRADIENT NORM TOTAL] 6.4957

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52354157 0.4764584 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 118/1738 | C: 213/1835
[LOSS Ex1] A: 0.00000 | B: 0.68692 | C: 0.68292
[LOGITS Ex2 A] Mean Abs: 0.862 | Max: 3.594
[LOSS Ex2] A: 0.42852 | B: 0.46403 | C: 0.45134
** [JOINT LOSS] ** : 0.904575
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004236 | Grad Max: 0.110307
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198989 | Grad Max: 1.197965
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001208 | Grad Max: 0.004975
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016457 | Grad Max: 0.016457
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001691 | Grad Max: 0.065662
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032252 | Grad Max: 0.371374
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.010010
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015066 | Grad Max: 0.049603
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001386
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003965 | Grad Max: 0.009854
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000807
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001737 | Grad Max: 0.004829
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004929 | Grad Max: 0.010441
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.067084 | Grad Max: 0.067084
[GRADIENT NORM TOTAL] 3.9659

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.111
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51045907 0.4895409 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 123/1925 | C: 226/1822
[LOSS Ex1] A: 0.00000 | B: 0.68662 | C: 0.68302
[LOGITS Ex2 A] Mean Abs: 0.822 | Max: 3.743
[LOSS Ex2] A: 0.41318 | B: 0.46577 | C: 0.44520
** [JOINT LOSS] ** : 0.897930
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002345 | Grad Max: 0.067573
  -> Layer: shared_layers.0.bias | Grad Mean: 0.073075 | Grad Max: 0.442686
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001162 | Grad Max: 0.005136
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014915 | Grad Max: 0.014915
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000726 | Grad Max: 0.031449
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013173 | Grad Max: 0.170650
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000180 | Grad Max: 0.004033
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005834 | Grad Max: 0.020782
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000627
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001550 | Grad Max: 0.004251
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000360
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000679 | Grad Max: 0.002024
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002065 | Grad Max: 0.005202
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026725 | Grad Max: 0.026725
[GRADIENT NORM TOTAL] 1.5344

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.219
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5113187  0.48868135] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 123/1925 | C: 223/1825
[LOSS Ex1] A: 0.68453 | B: 0.68685 | C: 0.68284
[LOGITS Ex2 A] Mean Abs: 0.818 | Max: 3.917
[LOSS Ex2] A: 0.41762 | B: 0.45860 | C: 0.45875
** [JOINT LOSS] ** : 1.129728
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003966 | Grad Max: 0.104923
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141135 | Grad Max: 0.845677
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.009900
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015326 | Grad Max: 0.015326
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001322 | Grad Max: 0.052380
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024817 | Grad Max: 0.289540
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000344 | Grad Max: 0.007901
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011285 | Grad Max: 0.039932
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.001015
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002982 | Grad Max: 0.007221
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000635
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001302 | Grad Max: 0.003648
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003811 | Grad Max: 0.009144
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050725 | Grad Max: 0.050725
[GRADIENT NORM TOTAL] 2.8945

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.214
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.505235   0.49476495] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 115/1933 | C: 237/1811
[LOSS Ex1] A: 0.68448 | B: 0.68521 | C: 0.68193
[LOGITS Ex2 A] Mean Abs: 0.807 | Max: 3.809
[LOSS Ex2] A: 0.41943 | B: 0.45252 | C: 0.45841
** [JOINT LOSS] ** : 1.127328
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003530 | Grad Max: 0.091850
  -> Layer: shared_layers.0.bias | Grad Mean: 0.123736 | Grad Max: 0.748238
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001999 | Grad Max: 0.010457
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014806 | Grad Max: 0.014806
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001175 | Grad Max: 0.046133
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021650 | Grad Max: 0.260669
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.006507
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009874 | Grad Max: 0.030570
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000896
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002624 | Grad Max: 0.006472
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000532
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001148 | Grad Max: 0.003164
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003411 | Grad Max: 0.007891
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045300 | Grad Max: 0.045300
[GRADIENT NORM TOTAL] 2.5480

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.219
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51009023 0.4899098 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 3/2045 | B: 119/1737 | C: 223/1825
[LOSS Ex1] A: 0.68359 | B: 0.68684 | C: 0.68178
[LOGITS Ex2 A] Mean Abs: 0.818 | Max: 3.768
[LOSS Ex2] A: 0.42231 | B: 0.44768 | C: 0.44023
** [JOINT LOSS] ** : 1.120811
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.050477
  -> Layer: shared_layers.0.bias | Grad Mean: 0.030111 | Grad Max: 0.153235
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001911 | Grad Max: 0.009961
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008733 | Grad Max: 0.008733
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000337 | Grad Max: 0.013506
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005878 | Grad Max: 0.075346
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.003644
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002440 | Grad Max: 0.013734
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000325
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000644 | Grad Max: 0.001994
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000223
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000286 | Grad Max: 0.001096
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000839 | Grad Max: 0.003144
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010603 | Grad Max: 0.010603
[GRADIENT NORM TOTAL] 0.6822

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.195
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51154274 0.48845732] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 2/2046 | B: 123/1925 | C: 235/1813
[LOSS Ex1] A: 0.68479 | B: 0.68654 | C: 0.68167
[LOGITS Ex2 A] Mean Abs: 0.823 | Max: 3.897
[LOSS Ex2] A: 0.41593 | B: 0.47618 | C: 0.45664
** [JOINT LOSS] ** : 1.133914
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002424 | Grad Max: 0.050194
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107231 | Grad Max: 0.634212
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001795 | Grad Max: 0.009074
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009501 | Grad Max: 0.009501
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000925 | Grad Max: 0.040238
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017622 | Grad Max: 0.230344
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.005977
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008172 | Grad Max: 0.029281
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000703
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002145 | Grad Max: 0.005244
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000501
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000936 | Grad Max: 0.002795
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002416 | Grad Max: 0.005485
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034603 | Grad Max: 0.034603
[GRADIENT NORM TOTAL] 2.1553

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.142
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.502561   0.49743906] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 124/1924 | C: 223/1825
[LOSS Ex1] A: 0.00000 | B: 0.68677 | C: 0.68199
[LOGITS Ex2 A] Mean Abs: 0.789 | Max: 3.807
[LOSS Ex2] A: 0.41852 | B: 0.45629 | C: 0.45369
** [JOINT LOSS] ** : 0.899084
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001665 | Grad Max: 0.038104
  -> Layer: shared_layers.0.bias | Grad Mean: 0.044771 | Grad Max: 0.234484
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.005166
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012895 | Grad Max: 0.012895
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000416 | Grad Max: 0.017086
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007818 | Grad Max: 0.097869
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.003127
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003406 | Grad Max: 0.012596
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000355
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000891 | Grad Max: 0.002350
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000233
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000391 | Grad Max: 0.001285
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001017 | Grad Max: 0.003526
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014603 | Grad Max: 0.014603
[GRADIENT NORM TOTAL] 0.9211

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.197
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.523057 0.476943] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 117/1931 | C: 223/1825
[LOSS Ex1] A: 0.68454 | B: 0.68511 | C: 0.68119
[LOGITS Ex2 A] Mean Abs: 0.811 | Max: 4.026
[LOSS Ex2] A: 0.41130 | B: 0.44828 | C: 0.44990
** [JOINT LOSS] ** : 1.120104
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003477 | Grad Max: 0.088827
  -> Layer: shared_layers.0.bias | Grad Mean: 0.117063 | Grad Max: 0.706028
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002048 | Grad Max: 0.010576
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016367 | Grad Max: 0.016367
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001107 | Grad Max: 0.043646
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020441 | Grad Max: 0.228140
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.007659
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009236 | Grad Max: 0.038722
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000804
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002444 | Grad Max: 0.006053
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000575
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001065 | Grad Max: 0.003236
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003230 | Grad Max: 0.008097
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041788 | Grad Max: 0.041788
[GRADIENT NORM TOTAL] 2.4028

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.221
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079726  0.49202734] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 3/2045 | B: 123/1733 | C: 161/1215
[LOSS Ex1] A: 0.68549 | B: 0.68675 | C: 0.68155
[LOGITS Ex2 A] Mean Abs: 0.805 | Max: 3.687
[LOSS Ex2] A: 0.42106 | B: 0.44541 | C: 0.42835
** [JOINT LOSS] ** : 1.116197
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004365 | Grad Max: 0.139549
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139533 | Grad Max: 0.829994
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001797 | Grad Max: 0.009246
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013727 | Grad Max: 0.013727
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001400 | Grad Max: 0.056489
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025738 | Grad Max: 0.298001
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000358 | Grad Max: 0.008807
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011673 | Grad Max: 0.041274
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001108
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003090 | Grad Max: 0.007734
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000634
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001337 | Grad Max: 0.003851
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003874 | Grad Max: 0.008774
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050594 | Grad Max: 0.050594
[GRADIENT NORM TOTAL] 2.9847

[EPOCH SUMMARY] Train Loss: 1.0680

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0980 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.1089 -> New: 1.0980)

############################## EPOCH 15/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.524183 0.475817] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 126/1922 | C: 229/1819
[LOSS Ex1] A: 0.00000 | B: 0.68643 | C: 0.68219
[LOGITS Ex2 A] Mean Abs: 0.813 | Max: 3.629
[LOSS Ex2] A: 0.41991 | B: 0.46469 | C: 0.45796
** [JOINT LOSS] ** : 0.903726
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001875 | Grad Max: 0.054406
  -> Layer: shared_layers.0.bias | Grad Mean: 0.045119 | Grad Max: 0.246264
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001127 | Grad Max: 0.005098
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011149 | Grad Max: 0.011149
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000467 | Grad Max: 0.023562
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008417 | Grad Max: 0.119230
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.002886
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003816 | Grad Max: 0.012512
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000429
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001025 | Grad Max: 0.002976
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000269
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000448 | Grad Max: 0.001612
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001381 | Grad Max: 0.003543
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018061 | Grad Max: 0.018061
[GRADIENT NORM TOTAL] 0.9825

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.112
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51095027 0.48904976] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 126/1922 | C: 232/1816
[LOSS Ex1] A: 0.00000 | B: 0.68666 | C: 0.68137
[LOGITS Ex2 A] Mean Abs: 0.901 | Max: 3.543
[LOSS Ex2] A: 0.40658 | B: 0.47687 | C: 0.44158
** [JOINT LOSS] ** : 0.897685
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003656 | Grad Max: 0.095113
  -> Layer: shared_layers.0.bias | Grad Mean: 0.165471 | Grad Max: 0.983206
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001102 | Grad Max: 0.005238
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010056 | Grad Max: 0.010056
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001400 | Grad Max: 0.057887
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026895 | Grad Max: 0.331551
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000368 | Grad Max: 0.007726
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012188 | Grad Max: 0.040090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.001004
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003185 | Grad Max: 0.007559
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000687
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001379 | Grad Max: 0.004206
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003768 | Grad Max: 0.007710
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051996 | Grad Max: 0.051996
[GRADIENT NORM TOTAL] 3.3040

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.222
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.511127 0.488873] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 122/1926 | C: 212/1836
[LOSS Ex1] A: 0.68439 | B: 0.68499 | C: 0.68132
[LOGITS Ex2 A] Mean Abs: 0.921 | Max: 3.644
[LOSS Ex2] A: 0.43437 | B: 0.47240 | C: 0.46133
** [JOINT LOSS] ** : 1.139600
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006272 | Grad Max: 0.162702
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269086 | Grad Max: 1.610377
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.009855
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011943 | Grad Max: 0.011943
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002384 | Grad Max: 0.094208
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045219 | Grad Max: 0.541097
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000622 | Grad Max: 0.014403
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020522 | Grad Max: 0.070652
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000134 | Grad Max: 0.001768
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005358 | Grad Max: 0.012863
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000082 | Grad Max: 0.001058
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002317 | Grad Max: 0.006352
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006632 | Grad Max: 0.014726
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.088396 | Grad Max: 0.088396
[GRADIENT NORM TOTAL] 5.4526

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.217
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5049488 0.4950512] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 1/2047 | B: 126/1730 | C: 227/1821
[LOSS Ex1] A: 0.68433 | B: 0.68664 | C: 0.68041
[LOGITS Ex2 A] Mean Abs: 0.886 | Max: 3.672
[LOSS Ex2] A: 0.43255 | B: 0.45696 | C: 0.43904
** [JOINT LOSS] ** : 1.126643
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004443 | Grad Max: 0.121797
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182630 | Grad Max: 1.100712
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.009914
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009528 | Grad Max: 0.009528
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001614 | Grad Max: 0.066436
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030669 | Grad Max: 0.365904
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad Max: 0.009445
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013959 | Grad Max: 0.049194
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001119
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003644 | Grad Max: 0.008869
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000754
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001574 | Grad Max: 0.004381
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004356 | Grad Max: 0.009856
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059141 | Grad Max: 0.059141
[GRADIENT NORM TOTAL] 3.6998

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.222
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5098835 0.4901165] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 4/2044 | B: 127/1921 | C: 197/1851
[LOSS Ex1] A: 0.68344 | B: 0.68633 | C: 0.68492
[LOGITS Ex2 A] Mean Abs: 0.833 | Max: 3.726
[LOSS Ex2] A: 0.41380 | B: 0.46789 | C: 0.44870
** [JOINT LOSS] ** : 1.128356
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.068573
  -> Layer: shared_layers.0.bias | Grad Mean: 0.025011 | Grad Max: 0.111235
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.010936
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021189 | Grad Max: 0.021189
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000328 | Grad Max: 0.035112
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005351 | Grad Max: 0.194873
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000064 | Grad Max: 0.002742
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001850 | Grad Max: 0.012712
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000287
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000471 | Grad Max: 0.001541
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000188
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000205 | Grad Max: 0.000865
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000593 | Grad Max: 0.002535
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007246 | Grad Max: 0.007246
[GRADIENT NORM TOTAL] 0.6686

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.198
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51133305 0.48866695] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 2/2046 | B: 129/1919 | C: 217/1831
[LOSS Ex1] A: 0.68468 | B: 0.68656 | C: 0.68239
[LOGITS Ex2 A] Mean Abs: 0.810 | Max: 3.886
[LOSS Ex2] A: 0.43080 | B: 0.45952 | C: 0.45868
** [JOINT LOSS] ** : 1.134207
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004942 | Grad Max: 0.145242
  -> Layer: shared_layers.0.bias | Grad Mean: 0.212113 | Grad Max: 1.295845
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001687 | Grad Max: 0.008271
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003929 | Grad Max: 0.003929
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001908 | Grad Max: 0.088786
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036259 | Grad Max: 0.484962
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000506 | Grad Max: 0.010945
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016781 | Grad Max: 0.059343
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001501
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004379 | Grad Max: 0.010371
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000066 | Grad Max: 0.000866
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001888 | Grad Max: 0.005128
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005430 | Grad Max: 0.011634
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.072743 | Grad Max: 0.072743
[GRADIENT NORM TOTAL] 4.3527

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.143
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029498  0.49705023] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.510 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 123/1925 | C: 243/1805
[LOSS Ex1] A: 0.00000 | B: 0.68488 | C: 0.67945
[LOGITS Ex2 A] Mean Abs: 0.790 | Max: 4.007
[LOSS Ex2] A: 0.45712 | B: 0.45712 | C: 0.46716
** [JOINT LOSS] ** : 0.915241
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006780 | Grad Max: 0.183198
  -> Layer: shared_layers.0.bias | Grad Mean: 0.272646 | Grad Max: 1.637387
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001276 | Grad Max: 0.005693
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007305 | Grad Max: 0.007305
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002493 | Grad Max: 0.092812
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047029 | Grad Max: 0.536257
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000655 | Grad Max: 0.014189
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021664 | Grad Max: 0.073138
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000141 | Grad Max: 0.001813
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005646 | Grad Max: 0.013761
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000086 | Grad Max: 0.001064
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002435 | Grad Max: 0.006552
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.007032 | Grad Max: 0.014172
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.093909 | Grad Max: 0.093909
[GRADIENT NORM TOTAL] 5.5698

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.200
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5238307  0.47616932] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 129/1727 | C: 227/1821
[LOSS Ex1] A: 0.68438 | B: 0.68653 | C: 0.68146
[LOGITS Ex2 A] Mean Abs: 0.837 | Max: 3.863
[LOSS Ex2] A: 0.41381 | B: 0.45280 | C: 0.46660
** [JOINT LOSS] ** : 1.128529
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004911 | Grad Max: 0.147203
  -> Layer: shared_layers.0.bias | Grad Mean: 0.197985 | Grad Max: 1.166105
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.010298
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016511 | Grad Max: 0.016511
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.076258
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034339 | Grad Max: 0.444724
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.010382
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015851 | Grad Max: 0.053935
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001305
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004143 | Grad Max: 0.009849
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000063 | Grad Max: 0.000839
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001786 | Grad Max: 0.004959
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005176 | Grad Max: 0.011118
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.069346 | Grad Max: 0.069346
[GRADIENT NORM TOTAL] 4.0759

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.224
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077299  0.49227008] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 3/2045 | B: 127/1921 | C: 226/1822
[LOSS Ex1] A: 0.68536 | B: 0.68622 | C: 0.68203
[LOGITS Ex2 A] Mean Abs: 0.831 | Max: 3.772
[LOSS Ex2] A: 0.40546 | B: 0.46067 | C: 0.42191
** [JOINT LOSS] ** : 1.113883
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.052642
  -> Layer: shared_layers.0.bias | Grad Mean: 0.019749 | Grad Max: 0.068982
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001856 | Grad Max: 0.009606
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015081 | Grad Max: 0.015081
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000281 | Grad Max: 0.020232
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004055 | Grad Max: 0.101093
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.001747
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001265 | Grad Max: 0.008387
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000288
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000344 | Grad Max: 0.001773
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000139 | Grad Max: 0.000694
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000886 | Grad Max: 0.002874
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005118 | Grad Max: 0.005118
[GRADIENT NORM TOTAL] 0.5379

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.524884   0.47511595] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/2048 | B: 131/1917 | C: 236/1812
[LOSS Ex1] A: 0.00000 | B: 0.68645 | C: 0.68282
[LOGITS Ex2 A] Mean Abs: 0.913 | Max: 3.748
[LOSS Ex2] A: 0.43396 | B: 0.47900 | C: 0.45125
** [JOINT LOSS] ** : 0.911164
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004729 | Grad Max: 0.121480
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207725 | Grad Max: 1.284658
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001244 | Grad Max: 0.005050
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017454 | Grad Max: 0.017454
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001833 | Grad Max: 0.082115
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034886 | Grad Max: 0.429885
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000484 | Grad Max: 0.010176
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016052 | Grad Max: 0.052456
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001332
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004153 | Grad Max: 0.009862
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000063 | Grad Max: 0.000842
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001780 | Grad Max: 0.004797
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005054 | Grad Max: 0.011616
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.067754 | Grad Max: 0.067754
[GRADIENT NORM TOTAL] 4.2740

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.115
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.511518   0.48848203] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.006
[MASKS] A(Pass/Fail): 0/1616 | B: 123/1925 | C: 221/1827
[LOSS Ex1] A: 0.00000 | B: 0.68477 | C: 0.68193
[LOGITS Ex2 A] Mean Abs: 0.966 | Max: 3.913
[LOSS Ex2] A: 0.42710 | B: 0.47495 | C: 0.45052
** [JOINT LOSS] ** : 0.906420
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006219 | Grad Max: 0.179009
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269701 | Grad Max: 1.647094
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001184 | Grad Max: 0.005321
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013013 | Grad Max: 0.013013
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002389 | Grad Max: 0.104021
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045675 | Grad Max: 0.546358
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000629 | Grad Max: 0.014190
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020855 | Grad Max: 0.069593
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000134 | Grad Max: 0.001714
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005400 | Grad Max: 0.012614
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000081 | Grad Max: 0.001057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002304 | Grad Max: 0.006476
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.006516 | Grad Max: 0.014732
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.086683 | Grad Max: 0.086683
[GRADIENT NORM TOTAL] 5.5142

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.225
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108996 0.4891004] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 130/1726 | C: 238/1810
[LOSS Ex1] A: 0.68424 | B: 0.68643 | C: 0.68162
[LOGITS Ex2 A] Mean Abs: 0.914 | Max: 3.843
[LOSS Ex2] A: 0.43530 | B: 0.45927 | C: 0.43118
** [JOINT LOSS] ** : 1.126011
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005624 | Grad Max: 0.149093
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208167 | Grad Max: 1.265968
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.010410
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018221 | Grad Max: 0.018221
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001909 | Grad Max: 0.078900
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036033 | Grad Max: 0.435985
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000490 | Grad Max: 0.011118
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016207 | Grad Max: 0.054914
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001260
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004201 | Grad Max: 0.009635
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000064 | Grad Max: 0.000880
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001794 | Grad Max: 0.005035
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005151 | Grad Max: 0.011869
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.068487 | Grad Max: 0.068487
[GRADIENT NORM TOTAL] 4.2644

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.220
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046276  0.49537238] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 128/1920 | C: 209/1839
[LOSS Ex1] A: 0.68416 | B: 0.68612 | C: 0.68342
[LOGITS Ex2 A] Mean Abs: 0.856 | Max: 3.720
[LOSS Ex2] A: 0.40550 | B: 0.46512 | C: 0.43975
** [JOINT LOSS] ** : 1.121355
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001493 | Grad Max: 0.038116
  -> Layer: shared_layers.0.bias | Grad Mean: 0.010417 | Grad Max: 0.076368
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.010289
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016749 | Grad Max: 0.016749
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000182 | Grad Max: 0.014433
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002498 | Grad Max: 0.080229
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000030 | Grad Max: 0.002178
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000614 | Grad Max: 0.004940
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000184
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000136 | Grad Max: 0.000994
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000104
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000059 | Grad Max: 0.000406
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000592 | Grad Max: 0.001986
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000707 | Grad Max: 0.000707
[GRADIENT NORM TOTAL] 0.3535

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.224
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50958896 0.490411  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 4/2044 | B: 135/1913 | C: 133/1243
[LOSS Ex1] A: 0.68326 | B: 0.68636 | C: 0.68330
[LOGITS Ex2 A] Mean Abs: 0.831 | Max: 3.813
[LOSS Ex2] A: 0.41506 | B: 0.45669 | C: 0.47042
** [JOINT LOSS] ** : 1.131695
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003122 | Grad Max: 0.094528
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160746 | Grad Max: 1.010098
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.010324
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015929 | Grad Max: 0.015929
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001390 | Grad Max: 0.058448
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026419 | Grad Max: 0.319131
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000367 | Grad Max: 0.008674
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012255 | Grad Max: 0.044864
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000995
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003155 | Grad Max: 0.007567
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000629
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001354 | Grad Max: 0.003908
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003934 | Grad Max: 0.008758
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052981 | Grad Max: 0.052981
[GRADIENT NORM TOTAL] 3.2790

[EPOCH SUMMARY] Train Loss: 1.0489

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.1015 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 16/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.200
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51105624 0.48894376] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 124/1924 | C: 203/1845
[LOSS Ex1] A: 0.68455 | B: 0.68467 | C: 0.68411
[LOGITS Ex2 A] Mean Abs: 0.827 | Max: 3.921
[LOSS Ex2] A: 0.43074 | B: 0.44308 | C: 0.47362
** [JOINT LOSS] ** : 1.133587
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004487 | Grad Max: 0.132604
  -> Layer: shared_layers.0.bias | Grad Mean: 0.179252 | Grad Max: 1.065669
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001715 | Grad Max: 0.008490
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009162 | Grad Max: 0.009162
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001612 | Grad Max: 0.061179
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030387 | Grad Max: 0.339816
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad Max: 0.010141
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013990 | Grad Max: 0.052960
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001180
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003624 | Grad Max: 0.008381
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000736
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001549 | Grad Max: 0.004320
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004508 | Grad Max: 0.009440
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059915 | Grad Max: 0.059915
[GRADIENT NORM TOTAL] 3.6228

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.144
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034008  0.49659914] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 0/2048 | B: 132/1724 | C: 231/1817
[LOSS Ex1] A: 0.00000 | B: 0.68633 | C: 0.68203
[LOGITS Ex2 A] Mean Abs: 0.803 | Max: 3.944
[LOSS Ex2] A: 0.40524 | B: 0.44198 | C: 0.43917
** [JOINT LOSS] ** : 0.884918
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002295 | Grad Max: 0.071360
  -> Layer: shared_layers.0.bias | Grad Mean: 0.091014 | Grad Max: 0.539404
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.005291
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013430 | Grad Max: 0.013430
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000841 | Grad Max: 0.039313
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015776 | Grad Max: 0.227533
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.005422
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007428 | Grad Max: 0.025816
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000735
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001936 | Grad Max: 0.004825
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000388
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000822 | Grad Max: 0.002284
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002440 | Grad Max: 0.006008
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031566 | Grad Max: 0.031566
[GRADIENT NORM TOTAL] 1.9216

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.202
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5247694  0.47523054] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 128/1920 | C: 233/1815
[LOSS Ex1] A: 0.68419 | B: 0.68601 | C: 0.68034
[LOGITS Ex2 A] Mean Abs: 0.889 | Max: 4.247
[LOSS Ex2] A: 0.42121 | B: 0.48323 | C: 0.43635
** [JOINT LOSS] ** : 1.130441
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004491 | Grad Max: 0.123723
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160572 | Grad Max: 0.970658
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.010220
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012871 | Grad Max: 0.012871
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001460 | Grad Max: 0.065423
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027273 | Grad Max: 0.364945
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.008141
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012190 | Grad Max: 0.042043
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.001042
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003146 | Grad Max: 0.007951
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000689
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001335 | Grad Max: 0.003885
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003765 | Grad Max: 0.007837
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049801 | Grad Max: 0.049801
[GRADIENT NORM TOTAL] 3.2879

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.227
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.507427 0.492573] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 3/2045 | B: 136/1912 | C: 239/1809
[LOSS Ex1] A: 0.68521 | B: 0.68626 | C: 0.68083
[LOGITS Ex2 A] Mean Abs: 0.932 | Max: 3.791
[LOSS Ex2] A: 0.43016 | B: 0.48678 | C: 0.45217
** [JOINT LOSS] ** : 1.140468
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005027 | Grad Max: 0.147015
  -> Layer: shared_layers.0.bias | Grad Mean: 0.254324 | Grad Max: 1.569784
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001789 | Grad Max: 0.009115
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010002 | Grad Max: 0.010002
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.095881
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042344 | Grad Max: 0.535378
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000583 | Grad Max: 0.012295
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019491 | Grad Max: 0.065240
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000123 | Grad Max: 0.001608
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005006 | Grad Max: 0.012105
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000074 | Grad Max: 0.000885
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002116 | Grad Max: 0.005856
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005940 | Grad Max: 0.013931
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.079019 | Grad Max: 0.079019
[GRADIENT NORM TOTAL] 5.2178

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5257787  0.47422132] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 0/2048 | B: 127/1921 | C: 216/1832
[LOSS Ex1] A: 0.00000 | B: 0.68456 | C: 0.68386
[LOGITS Ex2 A] Mean Abs: 0.906 | Max: 3.868
[LOSS Ex2] A: 0.41577 | B: 0.46173 | C: 0.43326
** [JOINT LOSS] ** : 0.893063
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004417 | Grad Max: 0.120156
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207726 | Grad Max: 1.262976
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001332 | Grad Max: 0.005507
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.020607 | Grad Max: 0.020607
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001801 | Grad Max: 0.075290
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034295 | Grad Max: 0.423118
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000471 | Grad Max: 0.010700
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015706 | Grad Max: 0.051935
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000099 | Grad Max: 0.001301
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004034 | Grad Max: 0.009598
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.000722
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001711 | Grad Max: 0.004612
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004786 | Grad Max: 0.009870
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064589 | Grad Max: 0.064589
[GRADIENT NORM TOTAL] 4.2212

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.117
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51220924 0.4877908 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 0/1616 | B: 133/1723 | C: 203/1845
[LOSS Ex1] A: 0.00000 | B: 0.68624 | C: 0.68257
[LOGITS Ex2 A] Mean Abs: 0.894 | Max: 4.048
[LOSS Ex2] A: 0.38565 | B: 0.44125 | C: 0.47047
** [JOINT LOSS] ** : 0.888723
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.045406
  -> Layer: shared_layers.0.bias | Grad Mean: 0.019875 | Grad Max: 0.087856
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001040 | Grad Max: 0.004993
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006941 | Grad Max: 0.006941
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000272 | Grad Max: 0.025610
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004334 | Grad Max: 0.144034
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.002505
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001587 | Grad Max: 0.009628
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000253
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000426 | Grad Max: 0.001868
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000148
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000176 | Grad Max: 0.000697
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000614 | Grad Max: 0.002544
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006703 | Grad Max: 0.006703
[GRADIENT NORM TOTAL] 0.5506

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.227
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5106218 0.4893782] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 128/1920 | C: 199/1849
[LOSS Ex1] A: 0.68406 | B: 0.68592 | C: 0.68269
[LOGITS Ex2 A] Mean Abs: 0.879 | Max: 4.162
[LOSS Ex2] A: 0.40121 | B: 0.46604 | C: 0.43272
** [JOINT LOSS] ** : 1.117548
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002475 | Grad Max: 0.060079
  -> Layer: shared_layers.0.bias | Grad Mean: 0.083778 | Grad Max: 0.479013
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.010175
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017274 | Grad Max: 0.017274
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000749 | Grad Max: 0.038670
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013972 | Grad Max: 0.213861
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.003960
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006263 | Grad Max: 0.020095
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000591
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001614 | Grad Max: 0.004303
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000323
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000682 | Grad Max: 0.001933
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001985 | Grad Max: 0.004789
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026148 | Grad Max: 0.026148
[GRADIENT NORM TOTAL] 1.6908

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.222
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50428486 0.4957151 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 138/1910 | C: 213/1835
[LOSS Ex1] A: 0.68397 | B: 0.68616 | C: 0.68371
[LOGITS Ex2 A] Mean Abs: 0.859 | Max: 3.993
[LOSS Ex2] A: 0.39865 | B: 0.44419 | C: 0.43523
** [JOINT LOSS] ** : 1.110633
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001903 | Grad Max: 0.042184
  -> Layer: shared_layers.0.bias | Grad Mean: 0.073431 | Grad Max: 0.413499
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.010720
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022378 | Grad Max: 0.022378
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000622 | Grad Max: 0.027579
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011491 | Grad Max: 0.147508
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.004581
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005188 | Grad Max: 0.018807
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000500
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001344 | Grad Max: 0.003814
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000563 | Grad Max: 0.001822
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001663 | Grad Max: 0.004850
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020945 | Grad Max: 0.020945
[GRADIENT NORM TOTAL] 1.4449

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.227
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50926006 0.4907399 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 4/2044 | B: 127/1921 | C: 221/1827
[LOSS Ex1] A: 0.68308 | B: 0.68446 | C: 0.68100
[LOGITS Ex2 A] Mean Abs: 0.892 | Max: 4.002
[LOSS Ex2] A: 0.40651 | B: 0.43981 | C: 0.44363
** [JOINT LOSS] ** : 1.112829
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003407 | Grad Max: 0.112847
  -> Layer: shared_layers.0.bias | Grad Mean: 0.105245 | Grad Max: 0.611043
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.010381
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011301 | Grad Max: 0.011301
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001031 | Grad Max: 0.048134
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019112 | Grad Max: 0.256926
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000261 | Grad Max: 0.006098
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008536 | Grad Max: 0.030141
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000787
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002185 | Grad Max: 0.005304
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000454
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000922 | Grad Max: 0.002615
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002532 | Grad Max: 0.005693
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034189 | Grad Max: 0.034189
[GRADIENT NORM TOTAL] 2.2326

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.202
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51074445 0.48925558] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 134/1722 | C: 244/1804
[LOSS Ex1] A: 0.68443 | B: 0.68613 | C: 0.67933
[LOGITS Ex2 A] Mean Abs: 0.888 | Max: 3.998
[LOSS Ex2] A: 0.40921 | B: 0.43983 | C: 0.42373
** [JOINT LOSS] ** : 1.107554
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002722 | Grad Max: 0.074738
  -> Layer: shared_layers.0.bias | Grad Mean: 0.096372 | Grad Max: 0.544514
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001867 | Grad Max: 0.009163
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007225 | Grad Max: 0.007225
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000872 | Grad Max: 0.040852
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016399 | Grad Max: 0.220382
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.005276
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007439 | Grad Max: 0.025759
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000644
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001899 | Grad Max: 0.004728
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000394
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000809 | Grad Max: 0.002396
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002212 | Grad Max: 0.005645
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030404 | Grad Max: 0.030404
[GRADIENT NORM TOTAL] 1.9474

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.144
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038853 0.4961147] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 129/1919 | C: 217/1831
[LOSS Ex1] A: 0.68406 | B: 0.68581 | C: 0.68041
[LOGITS Ex2 A] Mean Abs: 0.828 | Max: 3.761
[LOSS Ex2] A: 0.40028 | B: 0.45402 | C: 0.44444
** [JOINT LOSS] ** : 1.116334
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001494 | Grad Max: 0.043796
  -> Layer: shared_layers.0.bias | Grad Mean: 0.062049 | Grad Max: 0.390537
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001841 | Grad Max: 0.008442
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002821 | Grad Max: 0.002821
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000585 | Grad Max: 0.041597
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010837 | Grad Max: 0.226150
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.003746
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004877 | Grad Max: 0.017263
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000431
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001259 | Grad Max: 0.003135
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000247
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000534 | Grad Max: 0.001536
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001660 | Grad Max: 0.004499
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021715 | Grad Max: 0.021715
[GRADIENT NORM TOTAL] 1.3588

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.205
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52569985 0.47430015] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 141/1907 | C: 225/1823
[LOSS Ex1] A: 0.68400 | B: 0.68605 | C: 0.68160
[LOGITS Ex2 A] Mean Abs: 0.862 | Max: 3.802
[LOSS Ex2] A: 0.39726 | B: 0.44786 | C: 0.43976
** [JOINT LOSS] ** : 1.112172
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001472 | Grad Max: 0.029595
  -> Layer: shared_layers.0.bias | Grad Mean: 0.030065 | Grad Max: 0.172416
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.010226
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017991 | Grad Max: 0.017991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000357 | Grad Max: 0.023048
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006096 | Grad Max: 0.119529
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.002758
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002498 | Grad Max: 0.013571
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000313
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000644 | Grad Max: 0.002138
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000205
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000273 | Grad Max: 0.001040
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000987 | Grad Max: 0.003157
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012046 | Grad Max: 0.012046
[GRADIENT NORM TOTAL] 0.7164

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.229
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071367  0.49286327] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 3/2045 | B: 127/1921 | C: 244/1804
[LOSS Ex1] A: 0.68506 | B: 0.68433 | C: 0.68056
[LOGITS Ex2 A] Mean Abs: 0.914 | Max: 3.916
[LOSS Ex2] A: 0.37902 | B: 0.44053 | C: 0.41107
** [JOINT LOSS] ** : 1.093522
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001631 | Grad Max: 0.030929
  -> Layer: shared_layers.0.bias | Grad Mean: 0.074335 | Grad Max: 0.445686
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.009660
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013553 | Grad Max: 0.013553
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000596 | Grad Max: 0.030112
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011218 | Grad Max: 0.166252
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.003910
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005129 | Grad Max: 0.020468
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000455
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001286 | Grad Max: 0.003408
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000279
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000545 | Grad Max: 0.001668
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001376 | Grad Max: 0.004385
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020409 | Grad Max: 0.020409
[GRADIENT NORM TOTAL] 1.4699

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5267029 0.4732971] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 0/2048 | B: 134/1722 | C: 160/1216
[LOSS Ex1] A: 0.00000 | B: 0.68601 | C: 0.68014
[LOGITS Ex2 A] Mean Abs: 0.886 | Max: 4.165
[LOSS Ex2] A: 0.40681 | B: 0.43759 | C: 0.41315
** [JOINT LOSS] ** : 0.874566
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001323 | Grad Max: 0.017181
  -> Layer: shared_layers.0.bias | Grad Mean: 0.020848 | Grad Max: 0.133769
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.005266
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009209 | Grad Max: 0.009209
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000211 | Grad Max: 0.035727
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003582 | Grad Max: 0.196897
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002199
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001247 | Grad Max: 0.009574
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000239
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000300 | Grad Max: 0.001464
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000167
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000139 | Grad Max: 0.000923
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000816 | Grad Max: 0.002617
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005880 | Grad Max: 0.005880
[GRADIENT NORM TOTAL] 0.5293

[EPOCH SUMMARY] Train Loss: 1.0512

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0864 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0980 -> New: 1.0864)

############################## EPOCH 17/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.120
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51294786 0.48705214] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 0/1616 | B: 130/1918 | C: 225/1823
[LOSS Ex1] A: 0.00000 | B: 0.68568 | C: 0.68134
[LOGITS Ex2 A] Mean Abs: 0.913 | Max: 3.896
[LOSS Ex2] A: 0.39461 | B: 0.46118 | C: 0.43143
** [JOINT LOSS] ** : 0.884746
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.055038
  -> Layer: shared_layers.0.bias | Grad Mean: 0.067780 | Grad Max: 0.378242
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.005091
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012619 | Grad Max: 0.012619
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000641 | Grad Max: 0.035616
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011567 | Grad Max: 0.180236
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.004128
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005278 | Grad Max: 0.018436
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000564
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001363 | Grad Max: 0.003364
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000321
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000567 | Grad Max: 0.001848
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001724 | Grad Max: 0.004282
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021690 | Grad Max: 0.021690
[GRADIENT NORM TOTAL] 1.3920

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.231
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5103812 0.4896188] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 143/1905 | C: 236/1812
[LOSS Ex1] A: 0.68386 | B: 0.68592 | C: 0.68151
[LOGITS Ex2 A] Mean Abs: 0.925 | Max: 4.098
[LOSS Ex2] A: 0.40267 | B: 0.44647 | C: 0.43113
** [JOINT LOSS] ** : 1.110520
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001332 | Grad Max: 0.036418
  -> Layer: shared_layers.0.bias | Grad Mean: 0.035328 | Grad Max: 0.180566
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001885 | Grad Max: 0.009974
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014690 | Grad Max: 0.014690
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000332 | Grad Max: 0.021158
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006027 | Grad Max: 0.112314
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003064
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002432 | Grad Max: 0.014657
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000330
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000611 | Grad Max: 0.001923
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000188
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000252 | Grad Max: 0.001035
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000599 | Grad Max: 0.002266
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008144 | Grad Max: 0.008144
[GRADIENT NORM TOTAL] 0.7514

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.225
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50389457 0.49610546] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 127/1921 | C: 221/1827
[LOSS Ex1] A: 0.68372 | B: 0.68418 | C: 0.68166
[LOGITS Ex2 A] Mean Abs: 0.916 | Max: 4.210
[LOSS Ex2] A: 0.39187 | B: 0.42916 | C: 0.42168
** [JOINT LOSS] ** : 1.097426
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001292 | Grad Max: 0.017251
  -> Layer: shared_layers.0.bias | Grad Mean: 0.010881 | Grad Max: 0.079772
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.010810
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018295 | Grad Max: 0.018295
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000160 | Grad Max: 0.019682
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.002270 | Grad Max: 0.111924
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000029 | Grad Max: 0.001832
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000635 | Grad Max: 0.006618
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000195
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000133 | Grad Max: 0.000889
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000111
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000055 | Grad Max: 0.000475
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000749 | Grad Max: 0.002079
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000029 | Grad Max: 0.000029
[GRADIENT NORM TOTAL] 0.3289

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.230
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50896627 0.49103376] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 4/2044 | B: 134/1722 | C: 197/1851
[LOSS Ex1] A: 0.68283 | B: 0.68586 | C: 0.68368
[LOGITS Ex2 A] Mean Abs: 0.928 | Max: 4.508
[LOSS Ex2] A: 0.39535 | B: 0.43602 | C: 0.44188
** [JOINT LOSS] ** : 1.108540
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001449 | Grad Max: 0.031632
  -> Layer: shared_layers.0.bias | Grad Mean: 0.041764 | Grad Max: 0.256673
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001949 | Grad Max: 0.010308
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015618 | Grad Max: 0.015618
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000399 | Grad Max: 0.019510
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006800 | Grad Max: 0.110206
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.002685
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002626 | Grad Max: 0.012104
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000301
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000650 | Grad Max: 0.002221
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000181
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000270 | Grad Max: 0.001121
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000797 | Grad Max: 0.002680
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010639 | Grad Max: 0.010639
[GRADIENT NORM TOTAL] 0.8587

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.204
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104768  0.48952317] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 2/2046 | B: 130/1918 | C: 234/1814
[LOSS Ex1] A: 0.68423 | B: 0.68552 | C: 0.68138
[LOGITS Ex2 A] Mean Abs: 0.914 | Max: 4.258
[LOSS Ex2] A: 0.40070 | B: 0.45908 | C: 0.42844
** [JOINT LOSS] ** : 1.113118
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001475 | Grad Max: 0.035359
  -> Layer: shared_layers.0.bias | Grad Mean: 0.059925 | Grad Max: 0.343280
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001763 | Grad Max: 0.008671
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008451 | Grad Max: 0.008451
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000483 | Grad Max: 0.026277
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009010 | Grad Max: 0.143426
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.003344
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003835 | Grad Max: 0.013804
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000425
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000951 | Grad Max: 0.003013
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000382 | Grad Max: 0.001253
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000883 | Grad Max: 0.002414
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012854 | Grad Max: 0.012854
[GRADIENT NORM TOTAL] 1.1869

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.146
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044901  0.49550986] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.007
[MASKS] A(Pass/Fail): 1/2047 | B: 144/1904 | C: 247/1801
[LOSS Ex1] A: 0.68385 | B: 0.68576 | C: 0.67952
[LOGITS Ex2 A] Mean Abs: 0.895 | Max: 4.391
[LOSS Ex2] A: 0.39381 | B: 0.44478 | C: 0.42799
** [JOINT LOSS] ** : 1.105236
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001778 | Grad Max: 0.047290
  -> Layer: shared_layers.0.bias | Grad Mean: 0.020806 | Grad Max: 0.102459
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.008561
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001383 | Grad Max: 0.001383
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000266 | Grad Max: 0.017881
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004454 | Grad Max: 0.097392
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.003075
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001875 | Grad Max: 0.010899
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000274
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000486 | Grad Max: 0.001711
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000143
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000198 | Grad Max: 0.000736
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000821 | Grad Max: 0.002719
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008510 | Grad Max: 0.008510
[GRADIENT NORM TOTAL] 0.5355

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5270523  0.47294778] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 1/2047 | B: 128/1920 | C: 208/1840
[LOSS Ex1] A: 0.68369 | B: 0.68401 | C: 0.68308
[LOGITS Ex2 A] Mean Abs: 0.943 | Max: 4.507
[LOSS Ex2] A: 0.38491 | B: 0.44137 | C: 0.44218
** [JOINT LOSS] ** : 1.106415
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001765 | Grad Max: 0.039053
  -> Layer: shared_layers.0.bias | Grad Mean: 0.052567 | Grad Max: 0.330271
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.010672
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021163 | Grad Max: 0.021163
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000463 | Grad Max: 0.023262
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008671 | Grad Max: 0.128519
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.003510
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003989 | Grad Max: 0.016820
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000379
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000979 | Grad Max: 0.002836
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000231
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000397 | Grad Max: 0.001301
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000931 | Grad Max: 0.002892
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013813 | Grad Max: 0.013813
[GRADIENT NORM TOTAL] 1.0945

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.234
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068086  0.49319148] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 3/2045 | B: 135/1721 | C: 231/1817
[LOSS Ex1] A: 0.68481 | B: 0.68570 | C: 0.68096
[LOGITS Ex2 A] Mean Abs: 0.933 | Max: 4.288
[LOSS Ex2] A: 0.39978 | B: 0.42745 | C: 0.43220
** [JOINT LOSS] ** : 1.103632
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003737 | Grad Max: 0.101406
  -> Layer: shared_layers.0.bias | Grad Mean: 0.080079 | Grad Max: 0.427770
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001788 | Grad Max: 0.008766
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012580 | Grad Max: 0.012580
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000813 | Grad Max: 0.061041
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014659 | Grad Max: 0.307049
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000193 | Grad Max: 0.004699
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006338 | Grad Max: 0.022008
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000584
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001607 | Grad Max: 0.003855
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000333
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000655 | Grad Max: 0.001964
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002066 | Grad Max: 0.004657
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025284 | Grad Max: 0.025284
[GRADIENT NORM TOTAL] 1.7521

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52806    0.47193998] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 0/2048 | B: 132/1916 | C: 228/1820
[LOSS Ex1] A: 0.00000 | B: 0.68535 | C: 0.68067
[LOGITS Ex2 A] Mean Abs: 0.928 | Max: 4.399
[LOSS Ex2] A: 0.39462 | B: 0.46194 | C: 0.43539
** [JOINT LOSS] ** : 0.885988
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001841 | Grad Max: 0.043215
  -> Layer: shared_layers.0.bias | Grad Mean: 0.064381 | Grad Max: 0.397835
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.005337
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011015 | Grad Max: 0.011015
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000606 | Grad Max: 0.058803
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011054 | Grad Max: 0.334285
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.004520
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004814 | Grad Max: 0.020439
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000466
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001199 | Grad Max: 0.003454
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000483 | Grad Max: 0.001457
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001504 | Grad Max: 0.003486
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018269 | Grad Max: 0.018269
[GRADIENT NORM TOTAL] 1.4570

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.124
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5140021  0.48599786] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 0/1616 | B: 144/1904 | C: 238/1810
[LOSS Ex1] A: 0.00000 | B: 0.68560 | C: 0.68105
[LOGITS Ex2 A] Mean Abs: 1.003 | Max: 4.162
[LOSS Ex2] A: 0.38482 | B: 0.44620 | C: 0.43731
** [JOINT LOSS] ** : 0.878329
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003411 | Grad Max: 0.104121
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100101 | Grad Max: 0.545095
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001108 | Grad Max: 0.005435
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007526 | Grad Max: 0.007526
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000966 | Grad Max: 0.044281
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017970 | Grad Max: 0.239407
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.005817
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007926 | Grad Max: 0.029930
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000642
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001957 | Grad Max: 0.005000
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000392
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000785 | Grad Max: 0.002267
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002222 | Grad Max: 0.004948
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028397 | Grad Max: 0.028397
[GRADIENT NORM TOTAL] 2.0778

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.236
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5100983 0.4899017] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 2/2046 | B: 128/1920 | C: 229/1819
[LOSS Ex1] A: 0.68354 | B: 0.68383 | C: 0.67981
[LOGITS Ex2 A] Mean Abs: 0.988 | Max: 5.048
[LOSS Ex2] A: 0.40170 | B: 0.44208 | C: 0.42305
** [JOINT LOSS] ** : 1.104673
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004612 | Grad Max: 0.148670
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140364 | Grad Max: 0.782256
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001998 | Grad Max: 0.009659
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008097 | Grad Max: 0.008097
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001320 | Grad Max: 0.063392
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024487 | Grad Max: 0.335590
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.008016
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010873 | Grad Max: 0.036998
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000848
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002698 | Grad Max: 0.006339
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000531
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001076 | Grad Max: 0.002879
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003085 | Grad Max: 0.005850
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038776 | Grad Max: 0.038776
[GRADIENT NORM TOTAL] 2.8737

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.229
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503425 0.496575] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 1/2047 | B: 135/1721 | C: 230/1818
[LOSS Ex1] A: 0.68337 | B: 0.68553 | C: 0.68031
[LOGITS Ex2 A] Mean Abs: 0.957 | Max: 4.203
[LOSS Ex2] A: 0.38450 | B: 0.42459 | C: 0.45147
** [JOINT LOSS] ** : 1.103257
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001434 | Grad Max: 0.043666
  -> Layer: shared_layers.0.bias | Grad Mean: 0.055729 | Grad Max: 0.348707
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001964 | Grad Max: 0.010282
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014600 | Grad Max: 0.014600
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000505 | Grad Max: 0.031430
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008811 | Grad Max: 0.179186
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.003186
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003606 | Grad Max: 0.015426
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000407
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000900 | Grad Max: 0.002729
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000227
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000362 | Grad Max: 0.001220
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001040 | Grad Max: 0.002555
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014188 | Grad Max: 0.014188
[GRADIENT NORM TOTAL] 1.1900

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.235
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50861263 0.4913874 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 4/2044 | B: 132/1916 | C: 214/1834
[LOSS Ex1] A: 0.68248 | B: 0.68519 | C: 0.68154
[LOGITS Ex2 A] Mean Abs: 0.954 | Max: 4.339
[LOSS Ex2] A: 0.38966 | B: 0.45685 | C: 0.42298
** [JOINT LOSS] ** : 1.106233
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.057758
  -> Layer: shared_layers.0.bias | Grad Mean: 0.081006 | Grad Max: 0.501004
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.010534
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014468 | Grad Max: 0.014468
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000743 | Grad Max: 0.046758
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012969 | Grad Max: 0.265081
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.004678
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005408 | Grad Max: 0.022936
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000503
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001323 | Grad Max: 0.003733
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000296
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000529 | Grad Max: 0.001688
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001476 | Grad Max: 0.003920
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019959 | Grad Max: 0.019959
[GRADIENT NORM TOTAL] 1.6954

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.208
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101837  0.48981625] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 2/2046 | B: 145/1903 | C: 140/1236
[LOSS Ex1] A: 0.68397 | B: 0.68544 | C: 0.68114
[LOGITS Ex2 A] Mean Abs: 0.966 | Max: 4.321
[LOSS Ex2] A: 0.37702 | B: 0.44031 | C: 0.43515
** [JOINT LOSS] ** : 1.101010
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001506 | Grad Max: 0.023875
  -> Layer: shared_layers.0.bias | Grad Mean: 0.054680 | Grad Max: 0.330586
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001781 | Grad Max: 0.008914
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009138 | Grad Max: 0.009138
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000464 | Grad Max: 0.029429
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008470 | Grad Max: 0.167503
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.004165
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003587 | Grad Max: 0.018441
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000339
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000860 | Grad Max: 0.002670
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000281
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000341 | Grad Max: 0.001238
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000735 | Grad Max: 0.002763
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011444 | Grad Max: 0.011444
[GRADIENT NORM TOTAL] 1.1379

[EPOCH SUMMARY] Train Loss: 1.0578

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0761 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0864 -> New: 1.0761)

############################## EPOCH 18/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.148
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50518167 0.49481836] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 2/2046 | B: 130/1918 | C: 221/1827
[LOSS Ex1] A: 0.68358 | B: 0.68366 | C: 0.68179
[LOGITS Ex2 A] Mean Abs: 0.929 | Max: 4.297
[LOSS Ex2] A: 0.38223 | B: 0.44288 | C: 0.43252
** [JOINT LOSS] ** : 1.102214
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.027965
  -> Layer: shared_layers.0.bias | Grad Mean: 0.030940 | Grad Max: 0.231439
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001827 | Grad Max: 0.009095
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007431 | Grad Max: 0.007431
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000311 | Grad Max: 0.032581
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005317 | Grad Max: 0.182568
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.002370
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001681 | Grad Max: 0.010042
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000306
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000379 | Grad Max: 0.001505
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000145
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000692
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000470 | Grad Max: 0.001811
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005432 | Grad Max: 0.005432
[GRADIENT NORM TOTAL] 0.7187

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.213
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52862984 0.4713702 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.008
[MASKS] A(Pass/Fail): 1/2047 | B: 136/1720 | C: 213/1835
[LOSS Ex1] A: 0.68334 | B: 0.68537 | C: 0.68066
[LOGITS Ex2 A] Mean Abs: 0.982 | Max: 5.019
[LOSS Ex2] A: 0.37694 | B: 0.42463 | C: 0.42356
** [JOINT LOSS] ** : 1.091500
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002703 | Grad Max: 0.075449
  -> Layer: shared_layers.0.bias | Grad Mean: 0.078013 | Grad Max: 0.424874
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001993 | Grad Max: 0.010461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018169 | Grad Max: 0.018169
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000739 | Grad Max: 0.038404
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013496 | Grad Max: 0.213731
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000176 | Grad Max: 0.005135
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005871 | Grad Max: 0.023869
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000513
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001452 | Grad Max: 0.003651
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000292
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000574 | Grad Max: 0.001704
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001818 | Grad Max: 0.004966
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021654 | Grad Max: 0.021654
[GRADIENT NORM TOTAL] 1.5997

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.239
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50648326 0.49351668] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 3/2045 | B: 132/1916 | C: 246/1802
[LOSS Ex1] A: 0.68452 | B: 0.68502 | C: 0.68001
[LOGITS Ex2 A] Mean Abs: 0.976 | Max: 4.307
[LOSS Ex2] A: 0.39080 | B: 0.45223 | C: 0.41307
** [JOINT LOSS] ** : 1.101884
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.055476
  -> Layer: shared_layers.0.bias | Grad Mean: 0.022125 | Grad Max: 0.114261
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001802 | Grad Max: 0.009029
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009375 | Grad Max: 0.009375
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000299 | Grad Max: 0.018715
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004436 | Grad Max: 0.069392
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.002020
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001327 | Grad Max: 0.008990
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000212
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000345 | Grad Max: 0.001486
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000143 | Grad Max: 0.000684
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000629 | Grad Max: 0.002054
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006199 | Grad Max: 0.006199
[GRADIENT NORM TOTAL] 0.5858

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.087
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5294981  0.47050187] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 0/2048 | B: 146/1902 | C: 226/1822
[LOSS Ex1] A: 0.00000 | B: 0.68528 | C: 0.68107
[LOGITS Ex2 A] Mean Abs: 0.995 | Max: 4.402
[LOSS Ex2] A: 0.38579 | B: 0.45116 | C: 0.42474
** [JOINT LOSS] ** : 0.876009
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002336 | Grad Max: 0.055560
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100298 | Grad Max: 0.592342
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001162 | Grad Max: 0.005203
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012743 | Grad Max: 0.012743
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000857 | Grad Max: 0.040604
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016164 | Grad Max: 0.233396
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.006302
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007441 | Grad Max: 0.034691
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000540
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001790 | Grad Max: 0.004390
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000347
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000704 | Grad Max: 0.001957
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001852 | Grad Max: 0.004123
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024880 | Grad Max: 0.024880
[GRADIENT NORM TOTAL] 2.0622

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.128
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5150813 0.4849187] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.008
[MASKS] A(Pass/Fail): 0/1616 | B: 130/1918 | C: 228/1820
[LOSS Ex1] A: 0.00000 | B: 0.68348 | C: 0.68159
[LOGITS Ex2 A] Mean Abs: 1.030 | Max: 4.586
[LOSS Ex2] A: 0.37365 | B: 0.43036 | C: 0.42989
** [JOINT LOSS] ** : 0.866324
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.050695
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084209 | Grad Max: 0.490123
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001204 | Grad Max: 0.005184
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013601 | Grad Max: 0.013601
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000725 | Grad Max: 0.066515
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013542 | Grad Max: 0.376407
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.005188
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006085 | Grad Max: 0.023624
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000520
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001472 | Grad Max: 0.003624
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000348
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001799
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001553 | Grad Max: 0.003574
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020352 | Grad Max: 0.020352
[GRADIENT NORM TOTAL] 1.7422

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.241
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5098773 0.4901227] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 2/2046 | B: 136/1720 | C: 243/1805
[LOSS Ex1] A: 0.68321 | B: 0.68521 | C: 0.67947
[LOGITS Ex2 A] Mean Abs: 0.989 | Max: 5.067
[LOSS Ex2] A: 0.37965 | B: 0.42728 | C: 0.44940
** [JOINT LOSS] ** : 1.101404
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.072137
  -> Layer: shared_layers.0.bias | Grad Mean: 0.122643 | Grad Max: 0.775028
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001909 | Grad Max: 0.009879
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010264 | Grad Max: 0.010264
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001062 | Grad Max: 0.053038
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020077 | Grad Max: 0.305860
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.006432
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009534 | Grad Max: 0.030597
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000744
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002326 | Grad Max: 0.006118
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000409
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000908 | Grad Max: 0.002493
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002538 | Grad Max: 0.005158
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033667 | Grad Max: 0.033667
[GRADIENT NORM TOTAL] 2.5723

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.234
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030059  0.49699402] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.009
[MASKS] A(Pass/Fail): 1/2047 | B: 134/1914 | C: 235/1813
[LOSS Ex1] A: 0.68301 | B: 0.68486 | C: 0.68070
[LOGITS Ex2 A] Mean Abs: 0.978 | Max: 4.597
[LOSS Ex2] A: 0.37037 | B: 0.46054 | C: 0.44665
** [JOINT LOSS] ** : 1.108714
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002463 | Grad Max: 0.087341
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139236 | Grad Max: 0.880457
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.010799
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021632 | Grad Max: 0.021632
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001209 | Grad Max: 0.063758
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022721 | Grad Max: 0.343404
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000304 | Grad Max: 0.007752
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010300 | Grad Max: 0.037712
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000828
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002496 | Grad Max: 0.006091
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000507
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000988 | Grad Max: 0.003056
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002878 | Grad Max: 0.006105
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037683 | Grad Max: 0.037683
[GRADIENT NORM TOTAL] 2.9228

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.239
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50830007 0.4916999 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 5/2043 | B: 147/1901 | C: 206/1842
[LOSS Ex1] A: 0.68212 | B: 0.68512 | C: 0.68151
[LOGITS Ex2 A] Mean Abs: 0.991 | Max: 4.866
[LOSS Ex2] A: 0.39510 | B: 0.44359 | C: 0.42008
** [JOINT LOSS] ** : 1.102508
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003406 | Grad Max: 0.116625
  -> Layer: shared_layers.0.bias | Grad Mean: 0.039946 | Grad Max: 0.183652
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.009907
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009543 | Grad Max: 0.009543
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000544 | Grad Max: 0.025601
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007185 | Grad Max: 0.139218
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.002582
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.000997 | Grad Max: 0.012667
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000150 | Grad Max: 0.000948
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000103
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000059 | Grad Max: 0.000375
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000325 | Grad Max: 0.001342
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001862 | Grad Max: 0.001862
[GRADIENT NORM TOTAL] 1.0289

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.211
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50992167 0.4900783 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 2/2046 | B: 130/1918 | C: 223/1825
[LOSS Ex1] A: 0.68370 | B: 0.68331 | C: 0.68129
[LOGITS Ex2 A] Mean Abs: 1.018 | Max: 4.685
[LOSS Ex2] A: 0.39000 | B: 0.42877 | C: 0.42418
** [JOINT LOSS] ** : 1.097086
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003957 | Grad Max: 0.095260
  -> Layer: shared_layers.0.bias | Grad Mean: 0.166597 | Grad Max: 1.015105
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001823 | Grad Max: 0.008802
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007839 | Grad Max: 0.007839
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.080671
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027312 | Grad Max: 0.458161
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.008843
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012182 | Grad Max: 0.043272
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000923
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002961 | Grad Max: 0.006588
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000516
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001147 | Grad Max: 0.003237
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002994 | Grad Max: 0.006482
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040468 | Grad Max: 0.040468
[GRADIENT NORM TOTAL] 3.3973

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.151
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50582075 0.49417922] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.008
[MASKS] A(Pass/Fail): 2/2046 | B: 137/1719 | C: 208/1840
[LOSS Ex1] A: 0.68331 | B: 0.68506 | C: 0.68142
[LOGITS Ex2 A] Mean Abs: 0.993 | Max: 4.457
[LOSS Ex2] A: 0.37589 | B: 0.42330 | C: 0.41235
** [JOINT LOSS] ** : 1.087109
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002613 | Grad Max: 0.050132
  -> Layer: shared_layers.0.bias | Grad Mean: 0.081399 | Grad Max: 0.483575
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001730 | Grad Max: 0.008501
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001682 | Grad Max: 0.001682
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000761 | Grad Max: 0.038403
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013864 | Grad Max: 0.216687
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000185 | Grad Max: 0.004824
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006145 | Grad Max: 0.024005
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000539
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001475 | Grad Max: 0.003790
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000310
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000577 | Grad Max: 0.001775
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001568 | Grad Max: 0.004366
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020926 | Grad Max: 0.020926
[GRADIENT NORM TOTAL] 1.6861

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.217
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53015053 0.4698495 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 1/2047 | B: 137/1911 | C: 222/1826
[LOSS Ex1] A: 0.68299 | B: 0.68471 | C: 0.68203
[LOGITS Ex2 A] Mean Abs: 1.004 | Max: 4.778
[LOSS Ex2] A: 0.38918 | B: 0.46156 | C: 0.44097
** [JOINT LOSS] ** : 1.113813
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003982 | Grad Max: 0.119144
  -> Layer: shared_layers.0.bias | Grad Mean: 0.092328 | Grad Max: 0.508912
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.010560
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021617 | Grad Max: 0.021617
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000905 | Grad Max: 0.055473
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016639 | Grad Max: 0.282534
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000217 | Grad Max: 0.006116
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007255 | Grad Max: 0.028071
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000608
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001780 | Grad Max: 0.004266
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000353
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000701 | Grad Max: 0.002124
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002072 | Grad Max: 0.004487
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026428 | Grad Max: 0.026428
[GRADIENT NORM TOTAL] 1.9907

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.244
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062049  0.49379513] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 3/2045 | B: 148/1900 | C: 255/1793
[LOSS Ex1] A: 0.68425 | B: 0.68498 | C: 0.67830
[LOGITS Ex2 A] Mean Abs: 1.014 | Max: 4.611
[LOSS Ex2] A: 0.37593 | B: 0.43827 | C: 0.40569
** [JOINT LOSS] ** : 1.089138
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005411 | Grad Max: 0.161034
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111527 | Grad Max: 0.611851
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001802 | Grad Max: 0.008470
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005857 | Grad Max: 0.005857
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001143 | Grad Max: 0.052000
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020472 | Grad Max: 0.244815
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000273 | Grad Max: 0.006894
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008929 | Grad Max: 0.034655
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000730
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002187 | Grad Max: 0.005439
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000386
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000850 | Grad Max: 0.002385
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002679 | Grad Max: 0.005682
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032236 | Grad Max: 0.032236
[GRADIENT NORM TOTAL] 2.4034

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.090
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5308137  0.46918628] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 0/2048 | B: 133/1915 | C: 242/1806
[LOSS Ex1] A: 0.00000 | B: 0.68316 | C: 0.68013
[LOGITS Ex2 A] Mean Abs: 1.027 | Max: 4.585
[LOSS Ex2] A: 0.37872 | B: 0.43486 | C: 0.40746
** [JOINT LOSS] ** : 0.861442
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.064303
  -> Layer: shared_layers.0.bias | Grad Mean: 0.086425 | Grad Max: 0.496068
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001219 | Grad Max: 0.005246
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011295 | Grad Max: 0.011295
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000768 | Grad Max: 0.052432
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013424 | Grad Max: 0.298382
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.004095
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004664 | Grad Max: 0.019703
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000445
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001082 | Grad Max: 0.002878
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000222
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000419 | Grad Max: 0.001319
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000898 | Grad Max: 0.002653
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014665 | Grad Max: 0.014665
[GRADIENT NORM TOTAL] 1.8033

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.131
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5160247 0.4839753] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 1/1615 | B: 137/1719 | C: 155/1221
[LOSS Ex1] A: 0.68253 | B: 0.68492 | C: 0.68039
[LOGITS Ex2 A] Mean Abs: 1.085 | Max: 4.679
[LOSS Ex2] A: 0.36566 | B: 0.42323 | C: 0.44127
** [JOINT LOSS] ** : 1.092666
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002660 | Grad Max: 0.052425
  -> Layer: shared_layers.0.bias | Grad Mean: 0.062812 | Grad Max: 0.343283
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001889 | Grad Max: 0.009257
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002320 | Grad Max: 0.002320
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000572 | Grad Max: 0.032207
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009404 | Grad Max: 0.180940
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.003854
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003421 | Grad Max: 0.019334
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000375
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000801 | Grad Max: 0.002500
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000224
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000308 | Grad Max: 0.001305
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.002680
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010857 | Grad Max: 0.010857
[GRADIENT NORM TOTAL] 1.2379

[EPOCH SUMMARY] Train Loss: 1.0494

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0679 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0761 -> New: 1.0679)

############################## EPOCH 19/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.246
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5096955  0.49030453] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 2/2046 | B: 138/1910 | C: 219/1829
[LOSS Ex1] A: 0.68292 | B: 0.68457 | C: 0.68114
[LOGITS Ex2 A] Mean Abs: 1.040 | Max: 5.381
[LOSS Ex2] A: 0.35619 | B: 0.45310 | C: 0.42314
** [JOINT LOSS] ** : 1.093686
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001375 | Grad Max: 0.034492
  -> Layer: shared_layers.0.bias | Grad Mean: 0.018601 | Grad Max: 0.109097
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.010106
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017811 | Grad Max: 0.017811
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000230 | Grad Max: 0.017639
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.003598 | Grad Max: 0.097229
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002156
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001219 | Grad Max: 0.007355
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000231
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000280 | Grad Max: 0.001556
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000143
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000105 | Grad Max: 0.000498
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000447 | Grad Max: 0.001705
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003808 | Grad Max: 0.003808
[GRADIENT NORM TOTAL] 0.4705

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.238
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50266856 0.4973315 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 1/2047 | B: 149/1899 | C: 236/1812
[LOSS Ex1] A: 0.68270 | B: 0.68484 | C: 0.67977
[LOGITS Ex2 A] Mean Abs: 1.005 | Max: 4.552
[LOSS Ex2] A: 0.37148 | B: 0.44475 | C: 0.42521
** [JOINT LOSS] ** : 1.096254
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001493 | Grad Max: 0.055633
  -> Layer: shared_layers.0.bias | Grad Mean: 0.068249 | Grad Max: 0.437536
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001945 | Grad Max: 0.009827
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009703 | Grad Max: 0.009703
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000619 | Grad Max: 0.031182
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011314 | Grad Max: 0.167805
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.005036
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005231 | Grad Max: 0.023848
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000487
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001243 | Grad Max: 0.003389
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000228
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000478 | Grad Max: 0.001457
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001321 | Grad Max: 0.003488
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017882 | Grad Max: 0.017882
[GRADIENT NORM TOTAL] 1.4827

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.243
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080542 0.4919458] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 5/2043 | B: 134/1914 | C: 218/1830
[LOSS Ex1] A: 0.68181 | B: 0.68300 | C: 0.68030
[LOGITS Ex2 A] Mean Abs: 1.032 | Max: 4.886
[LOSS Ex2] A: 0.39412 | B: 0.42534 | C: 0.42032
** [JOINT LOSS] ** : 1.094965
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004972 | Grad Max: 0.171887
  -> Layer: shared_layers.0.bias | Grad Mean: 0.083107 | Grad Max: 0.388099
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.010329
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009804 | Grad Max: 0.009804
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000898 | Grad Max: 0.048274
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015639 | Grad Max: 0.227361
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.005055
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006279 | Grad Max: 0.023733
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000592
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001536 | Grad Max: 0.004375
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000333
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000586 | Grad Max: 0.001970
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.003499
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020425 | Grad Max: 0.020425
[GRADIENT NORM TOTAL] 1.8932

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.214
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50973535 0.49026462] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 2/2046 | B: 137/1719 | C: 226/1822
[LOSS Ex1] A: 0.68347 | B: 0.68477 | C: 0.68155
[LOGITS Ex2 A] Mean Abs: 1.005 | Max: 4.636
[LOSS Ex2] A: 0.37940 | B: 0.42830 | C: 0.42335
** [JOINT LOSS] ** : 1.093611
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002408 | Grad Max: 0.065587
  -> Layer: shared_layers.0.bias | Grad Mean: 0.026533 | Grad Max: 0.098073
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001709 | Grad Max: 0.008469
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006808 | Grad Max: 0.006808
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000340 | Grad Max: 0.020708
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005396 | Grad Max: 0.106076
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.002377
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001741 | Grad Max: 0.010403
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000233
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000427 | Grad Max: 0.001779
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000177
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000162 | Grad Max: 0.000880
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000430 | Grad Max: 0.001805
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005079 | Grad Max: 0.005079
[GRADIENT NORM TOTAL] 0.6762

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.153
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063159  0.49368408] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.511 | Std: 0.009
[MASKS] A(Pass/Fail): 2/2046 | B: 139/1909 | C: 247/1801
[LOSS Ex1] A: 0.68307 | B: 0.68442 | C: 0.67885
[LOGITS Ex2 A] Mean Abs: 0.978 | Max: 4.585
[LOSS Ex2] A: 0.37126 | B: 0.45644 | C: 0.42247
** [JOINT LOSS] ** : 1.098837
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.063346
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116037 | Grad Max: 0.719553
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001857 | Grad Max: 0.008649
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000334 | Grad Max: 0.000334
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001013 | Grad Max: 0.079399
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018775 | Grad Max: 0.448394
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.006095
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008430 | Grad Max: 0.030593
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000583
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002017 | Grad Max: 0.004802
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000337
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000773 | Grad Max: 0.002224
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002067 | Grad Max: 0.004463
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027726 | Grad Max: 0.027726
[GRADIENT NORM TOTAL] 2.4608

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.221
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5314298  0.46857014] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 1/2047 | B: 149/1899 | C: 219/1829
[LOSS Ex1] A: 0.68269 | B: 0.68469 | C: 0.68192
[LOGITS Ex2 A] Mean Abs: 1.012 | Max: 4.589
[LOSS Ex2] A: 0.35782 | B: 0.43318 | C: 0.43366
** [JOINT LOSS] ** : 1.091318
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.046363
  -> Layer: shared_layers.0.bias | Grad Mean: 0.051421 | Grad Max: 0.287754
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001952 | Grad Max: 0.009855
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016739 | Grad Max: 0.016739
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000498 | Grad Max: 0.030112
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008935 | Grad Max: 0.170696
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.003423
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003963 | Grad Max: 0.017181
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000465
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000953 | Grad Max: 0.002696
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000366 | Grad Max: 0.001190
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001162 | Grad Max: 0.003173
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014220 | Grad Max: 0.014220
[GRADIENT NORM TOTAL] 1.0728

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.249
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50600255 0.49399745] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 3/2045 | B: 135/1913 | C: 252/1796
[LOSS Ex1] A: 0.68400 | B: 0.68285 | C: 0.67995
[LOGITS Ex2 A] Mean Abs: 1.053 | Max: 4.235
[LOSS Ex2] A: 0.37184 | B: 0.44861 | C: 0.42255
** [JOINT LOSS] ** : 1.096598
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003248 | Grad Max: 0.113707
  -> Layer: shared_layers.0.bias | Grad Mean: 0.227232 | Grad Max: 1.428709
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001898 | Grad Max: 0.009293
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013367 | Grad Max: 0.013367
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001798 | Grad Max: 0.095065
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034770 | Grad Max: 0.539935
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.011843
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016553 | Grad Max: 0.062107
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001237
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003913 | Grad Max: 0.009651
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000716
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001487 | Grad Max: 0.004203
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003793 | Grad Max: 0.007653
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053146 | Grad Max: 0.053146
[GRADIENT NORM TOTAL] 4.6962

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.093
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5320062  0.46799383] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 0/2048 | B: 138/1718 | C: 248/1800
[LOSS Ex1] A: 0.00000 | B: 0.68463 | C: 0.67896
[LOGITS Ex2 A] Mean Abs: 1.079 | Max: 4.524
[LOSS Ex2] A: 0.39363 | B: 0.44457 | C: 0.41547
** [JOINT LOSS] ** : 0.872424
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004290 | Grad Max: 0.123652
  -> Layer: shared_layers.0.bias | Grad Mean: 0.236934 | Grad Max: 1.456883
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001161 | Grad Max: 0.005305
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009442 | Grad Max: 0.009442
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.108264
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037340 | Grad Max: 0.601556
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000510 | Grad Max: 0.012307
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017491 | Grad Max: 0.064642
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.001212
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004144 | Grad Max: 0.009639
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000638
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001579 | Grad Max: 0.004233
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004242 | Grad Max: 0.008531
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057470 | Grad Max: 0.057470
[GRADIENT NORM TOTAL] 4.8734

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.134
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51689625 0.48310378] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 1/1615 | B: 140/1908 | C: 244/1804
[LOSS Ex1] A: 0.68224 | B: 0.68428 | C: 0.68044
[LOGITS Ex2 A] Mean Abs: 1.082 | Max: 4.836
[LOSS Ex2] A: 0.34554 | B: 0.45815 | C: 0.40775
** [JOINT LOSS] ** : 1.086133
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.044557
  -> Layer: shared_layers.0.bias | Grad Mean: 0.089982 | Grad Max: 0.498503
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.009850
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010445 | Grad Max: 0.010445
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000743 | Grad Max: 0.044633
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013359 | Grad Max: 0.246807
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.005117
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005693 | Grad Max: 0.024139
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000516
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001311 | Grad Max: 0.003579
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000223
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000500 | Grad Max: 0.001523
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001077 | Grad Max: 0.003088
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017377 | Grad Max: 0.017377
[GRADIENT NORM TOTAL] 1.7899

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.251
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50957453 0.49042544] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 2/2046 | B: 149/1899 | C: 215/1833
[LOSS Ex1] A: 0.68262 | B: 0.68456 | C: 0.68005
[LOGITS Ex2 A] Mean Abs: 1.054 | Max: 4.551
[LOSS Ex2] A: 0.37367 | B: 0.44152 | C: 0.44570
** [JOINT LOSS] ** : 1.102707
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006315 | Grad Max: 0.154545
  -> Layer: shared_layers.0.bias | Grad Mean: 0.197956 | Grad Max: 1.145364
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.009291
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010106 | Grad Max: 0.010106
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001823 | Grad Max: 0.083490
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033943 | Grad Max: 0.433136
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.010533
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015491 | Grad Max: 0.054168
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001044
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003705 | Grad Max: 0.008505
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000591
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001415 | Grad Max: 0.003758
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004003 | Grad Max: 0.007356
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052040 | Grad Max: 0.052040
[GRADIENT NORM TOTAL] 4.0813

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.242
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023866 0.4976134] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 1/2047 | B: 136/1912 | C: 231/1817
[LOSS Ex1] A: 0.68238 | B: 0.68271 | C: 0.68122
[LOGITS Ex2 A] Mean Abs: 1.040 | Max: 5.081
[LOSS Ex2] A: 0.37993 | B: 0.43511 | C: 0.43448
** [JOINT LOSS] ** : 1.098608
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006336 | Grad Max: 0.162087
  -> Layer: shared_layers.0.bias | Grad Mean: 0.243172 | Grad Max: 1.404818
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.010726
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019589 | Grad Max: 0.019589
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.106992
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040077 | Grad Max: 0.586746
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000542 | Grad Max: 0.012120
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018336 | Grad Max: 0.062320
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001289
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004385 | Grad Max: 0.011009
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000689
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001666 | Grad Max: 0.004495
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004516 | Grad Max: 0.008945
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059878 | Grad Max: 0.059878
[GRADIENT NORM TOTAL] 4.9687

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.247
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50785494 0.49214506] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 5/2043 | B: 140/1716 | C: 244/1804
[LOSS Ex1] A: 0.68149 | B: 0.68451 | C: 0.67924
[LOGITS Ex2 A] Mean Abs: 1.044 | Max: 4.997
[LOSS Ex2] A: 0.37454 | B: 0.42681 | C: 0.42622
** [JOINT LOSS] ** : 1.090936
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002957 | Grad Max: 0.072199
  -> Layer: shared_layers.0.bias | Grad Mean: 0.145065 | Grad Max: 0.835343
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001973 | Grad Max: 0.009899
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009021 | Grad Max: 0.009021
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001214 | Grad Max: 0.060798
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022776 | Grad Max: 0.346510
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000318 | Grad Max: 0.006956
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010921 | Grad Max: 0.037715
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000842
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002608 | Grad Max: 0.006154
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000412
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000998 | Grad Max: 0.002690
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002773 | Grad Max: 0.005749
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037081 | Grad Max: 0.037081
[GRADIENT NORM TOTAL] 2.9533

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.217
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50958365 0.49041632] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 2/2046 | B: 142/1906 | C: 221/1827
[LOSS Ex1] A: 0.68323 | B: 0.68416 | C: 0.68065
[LOGITS Ex2 A] Mean Abs: 1.061 | Max: 4.774
[LOSS Ex2] A: 0.37540 | B: 0.46054 | C: 0.42869
** [JOINT LOSS] ** : 1.104222
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003385 | Grad Max: 0.092664
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176653 | Grad Max: 1.052156
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.008624
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008311 | Grad Max: 0.008311
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001432 | Grad Max: 0.069732
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027795 | Grad Max: 0.393444
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000382 | Grad Max: 0.008707
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013104 | Grad Max: 0.045341
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000914
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003055 | Grad Max: 0.007395
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000477
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001150 | Grad Max: 0.003183
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002813 | Grad Max: 0.006623
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039974 | Grad Max: 0.039974
[GRADIENT NORM TOTAL] 3.5599

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.154
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.506802 0.493198] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.009
[MASKS] A(Pass/Fail): 2/2046 | B: 149/1899 | C: 155/1221
[LOSS Ex1] A: 0.68284 | B: 0.68445 | C: 0.67971
[LOGITS Ex2 A] Mean Abs: 1.056 | Max: 4.788
[LOSS Ex2] A: 0.38368 | B: 0.46178 | C: 0.42245
** [JOINT LOSS] ** : 1.104968
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006473 | Grad Max: 0.171747
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269928 | Grad Max: 1.597622
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.008786
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003781 | Grad Max: 0.003781
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.108202
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043983 | Grad Max: 0.611451
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000591 | Grad Max: 0.015925
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020170 | Grad Max: 0.077199
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001517
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004769 | Grad Max: 0.011849
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000061 | Grad Max: 0.000791
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001802 | Grad Max: 0.005029
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004748 | Grad Max: 0.009739
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063938 | Grad Max: 0.063938
[GRADIENT NORM TOTAL] 5.4609

[EPOCH SUMMARY] Train Loss: 1.0804

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0679 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0679 -> New: 1.0679)

############################## EPOCH 20/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.225
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5326474 0.4673526] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 1/2047 | B: 136/1912 | C: 251/1797
[LOSS Ex1] A: 0.68241 | B: 0.68259 | C: 0.67897
[LOGITS Ex2 A] Mean Abs: 1.070 | Max: 4.965
[LOSS Ex2] A: 0.36606 | B: 0.42882 | C: 0.39508
** [JOINT LOSS] ** : 1.077975
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003845 | Grad Max: 0.108094
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152221 | Grad Max: 0.912392
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.010569
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018771 | Grad Max: 0.018771
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001294 | Grad Max: 0.065392
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024828 | Grad Max: 0.370909
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.008556
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011393 | Grad Max: 0.041198
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000917
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002706 | Grad Max: 0.006724
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000413
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001030 | Grad Max: 0.002928
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002630 | Grad Max: 0.006100
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036922 | Grad Max: 0.036922
[GRADIENT NORM TOTAL] 3.1025

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.253
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5058149 0.4941851] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 3/2045 | B: 141/1715 | C: 235/1813
[LOSS Ex1] A: 0.68378 | B: 0.68441 | C: 0.67853
[LOGITS Ex2 A] Mean Abs: 1.033 | Max: 4.304
[LOSS Ex2] A: 0.34848 | B: 0.42150 | C: 0.44147
** [JOINT LOSS] ** : 1.086058
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003666 | Grad Max: 0.100122
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156550 | Grad Max: 0.931207
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001781 | Grad Max: 0.008550
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005516 | Grad Max: 0.005516
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001348 | Grad Max: 0.067403
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024855 | Grad Max: 0.374202
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.007774
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011647 | Grad Max: 0.039715
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000850
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002777 | Grad Max: 0.006745
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000497
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001052 | Grad Max: 0.003049
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002859 | Grad Max: 0.005787
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038174 | Grad Max: 0.038174
[GRADIENT NORM TOTAL] 3.2255

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.095
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53302664 0.4669734 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 0/2048 | B: 143/1905 | C: 212/1836
[LOSS Ex1] A: 0.00000 | B: 0.68406 | C: 0.68176
[LOGITS Ex2 A] Mean Abs: 1.007 | Max: 4.437
[LOSS Ex2] A: 0.36836 | B: 0.46774 | C: 0.45093
** [JOINT LOSS] ** : 0.884282
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003553 | Grad Max: 0.103743
  -> Layer: shared_layers.0.bias | Grad Mean: 0.210486 | Grad Max: 1.246884
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001247 | Grad Max: 0.004795
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017403 | Grad Max: 0.017403
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001728 | Grad Max: 0.093486
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032724 | Grad Max: 0.519659
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000451 | Grad Max: 0.010704
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015640 | Grad Max: 0.053806
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001113
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003733 | Grad Max: 0.008774
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000614
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001427 | Grad Max: 0.004055
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003756 | Grad Max: 0.007736
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052019 | Grad Max: 0.052019
[GRADIENT NORM TOTAL] 4.3326

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.137
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51760423 0.48239574] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 1/1615 | B: 149/1899 | C: 246/1802
[LOSS Ex1] A: 0.68202 | B: 0.68435 | C: 0.67861
[LOGITS Ex2 A] Mean Abs: 1.049 | Max: 4.916
[LOSS Ex2] A: 0.35876 | B: 0.43080 | C: 0.40205
** [JOINT LOSS] ** : 1.078864
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001900 | Grad Max: 0.055545
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100777 | Grad Max: 0.596611
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001950 | Grad Max: 0.009138
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002988 | Grad Max: 0.002988
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000885 | Grad Max: 0.060917
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016317 | Grad Max: 0.340202
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.006679
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007756 | Grad Max: 0.035238
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000672
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001847 | Grad Max: 0.004711
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000322
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000695 | Grad Max: 0.001937
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001900 | Grad Max: 0.004735
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025217 | Grad Max: 0.025217
[GRADIENT NORM TOTAL] 2.1545

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.254
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5094613  0.49053872] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 2/2046 | B: 136/1912 | C: 263/1785
[LOSS Ex1] A: 0.68240 | B: 0.68249 | C: 0.67841
[LOGITS Ex2 A] Mean Abs: 1.096 | Max: 5.266
[LOSS Ex2] A: 0.38301 | B: 0.42916 | C: 0.42568
** [JOINT LOSS] ** : 1.093714
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005415 | Grad Max: 0.151652
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198683 | Grad Max: 1.154045
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.009614
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009021 | Grad Max: 0.009021
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001795 | Grad Max: 0.082295
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033927 | Grad Max: 0.460100
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000450 | Grad Max: 0.011645
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015330 | Grad Max: 0.057780
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001056
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003629 | Grad Max: 0.008948
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000570
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001374 | Grad Max: 0.003612
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003636 | Grad Max: 0.007991
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048906 | Grad Max: 0.048906
[GRADIENT NORM TOTAL] 4.1166

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.245
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50215775 0.49784225] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 1/2047 | B: 142/1714 | C: 231/1817
[LOSS Ex1] A: 0.68215 | B: 0.68432 | C: 0.67997
[LOGITS Ex2 A] Mean Abs: 1.094 | Max: 4.248
[LOSS Ex2] A: 0.38263 | B: 0.44352 | C: 0.41697
** [JOINT LOSS] ** : 1.096519
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006393 | Grad Max: 0.170892
  -> Layer: shared_layers.0.bias | Grad Mean: 0.257339 | Grad Max: 1.557462
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.010697
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017794 | Grad Max: 0.017794
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.105972
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043141 | Grad Max: 0.592697
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000585 | Grad Max: 0.013228
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020073 | Grad Max: 0.068517
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004748 | Grad Max: 0.010821
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000059 | Grad Max: 0.000722
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001791 | Grad Max: 0.004967
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004660 | Grad Max: 0.010094
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063788 | Grad Max: 0.063788
[GRADIENT NORM TOTAL] 5.2771

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.250
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077141  0.49228588] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 5/2043 | B: 143/1905 | C: 243/1805
[LOSS Ex1] A: 0.68126 | B: 0.68397 | C: 0.68203
[LOGITS Ex2 A] Mean Abs: 1.063 | Max: 4.721
[LOSS Ex2] A: 0.36639 | B: 0.45138 | C: 0.42704
** [JOINT LOSS] ** : 1.097353
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004829 | Grad Max: 0.122097
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141521 | Grad Max: 0.802264
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.010896
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021480 | Grad Max: 0.021480
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001312 | Grad Max: 0.057534
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024331 | Grad Max: 0.320149
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000322 | Grad Max: 0.009461
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010850 | Grad Max: 0.044149
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000792
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002572 | Grad Max: 0.006560
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000438
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000972 | Grad Max: 0.002688
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002492 | Grad Max: 0.005100
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035039 | Grad Max: 0.035039
[GRADIENT NORM TOTAL] 2.8969

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.220
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50947654 0.4905234 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 2/2046 | B: 149/1899 | C: 228/1820
[LOSS Ex1] A: 0.68305 | B: 0.68427 | C: 0.67983
[LOGITS Ex2 A] Mean Abs: 1.001 | Max: 4.605
[LOSS Ex2] A: 0.37208 | B: 0.44451 | C: 0.42279
** [JOINT LOSS] ** : 1.095510
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004545 | Grad Max: 0.131208
  -> Layer: shared_layers.0.bias | Grad Mean: 0.186119 | Grad Max: 1.093284
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001756 | Grad Max: 0.008378
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005130 | Grad Max: 0.005130
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001622 | Grad Max: 0.099009
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030497 | Grad Max: 0.538911
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000409 | Grad Max: 0.009521
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013975 | Grad Max: 0.054979
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001053
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003332 | Grad Max: 0.008580
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000481
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001255 | Grad Max: 0.003258
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003378 | Grad Max: 0.006791
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045023 | Grad Max: 0.045023
[GRADIENT NORM TOTAL] 3.8135

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.156
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071312  0.49286878] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 2/2046 | B: 136/1912 | C: 228/1820
[LOSS Ex1] A: 0.68265 | B: 0.68240 | C: 0.68059
[LOGITS Ex2 A] Mean Abs: 0.980 | Max: 4.393
[LOSS Ex2] A: 0.36636 | B: 0.43730 | C: 0.43643
** [JOINT LOSS] ** : 1.095241
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006261 | Grad Max: 0.172233
  -> Layer: shared_layers.0.bias | Grad Mean: 0.274738 | Grad Max: 1.666761
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001831 | Grad Max: 0.008735
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004383 | Grad Max: 0.004383
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002386 | Grad Max: 0.127585
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045346 | Grad Max: 0.692875
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.017136
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021005 | Grad Max: 0.085723
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000116 | Grad Max: 0.001389
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004967 | Grad Max: 0.011233
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000730
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001880 | Grad Max: 0.005033
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004960 | Grad Max: 0.009934
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.067569 | Grad Max: 0.067569
[GRADIENT NORM TOTAL] 5.6749

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.227
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5335702  0.46642986] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 1/2047 | B: 142/1714 | C: 216/1832
[LOSS Ex1] A: 0.68218 | B: 0.68424 | C: 0.68101
[LOGITS Ex2 A] Mean Abs: 1.035 | Max: 4.798
[LOSS Ex2] A: 0.35380 | B: 0.43311 | C: 0.45605
** [JOINT LOSS] ** : 1.096797
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004098 | Grad Max: 0.105308
  -> Layer: shared_layers.0.bias | Grad Mean: 0.168035 | Grad Max: 0.981702
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.010670
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021004 | Grad Max: 0.021004
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001457 | Grad Max: 0.073461
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027022 | Grad Max: 0.404966
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000365 | Grad Max: 0.009372
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012445 | Grad Max: 0.044669
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000847
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002949 | Grad Max: 0.007131
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000504
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001119 | Grad Max: 0.003107
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002957 | Grad Max: 0.005999
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039673 | Grad Max: 0.039673
[GRADIENT NORM TOTAL] 3.4457

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.256
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50569296 0.494307  ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 3/2045 | B: 144/1904 | C: 235/1813
[LOSS Ex1] A: 0.68361 | B: 0.68389 | C: 0.67995
[LOGITS Ex2 A] Mean Abs: 1.058 | Max: 4.517
[LOSS Ex2] A: 0.35956 | B: 0.45905 | C: 0.39264
** [JOINT LOSS] ** : 1.086233
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.048241
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111403 | Grad Max: 0.656656
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001755 | Grad Max: 0.008401
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007093 | Grad Max: 0.007093
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000895 | Grad Max: 0.048810
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016652 | Grad Max: 0.272283
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000227 | Grad Max: 0.006306
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007822 | Grad Max: 0.029115
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000611
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001820 | Grad Max: 0.004955
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000320
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000692 | Grad Max: 0.002000
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001604 | Grad Max: 0.003700
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024632 | Grad Max: 0.024632
[GRADIENT NORM TOTAL] 2.3045

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.097
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5338051  0.46619493] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 0/2048 | B: 150/1898 | C: 235/1813
[LOSS Ex1] A: 0.00000 | B: 0.68419 | C: 0.67933
[LOGITS Ex2 A] Mean Abs: 1.072 | Max: 4.435
[LOSS Ex2] A: 0.36685 | B: 0.44868 | C: 0.40327
** [JOINT LOSS] ** : 0.860774
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003721 | Grad Max: 0.084158
  -> Layer: shared_layers.0.bias | Grad Mean: 0.167985 | Grad Max: 1.020169
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001118 | Grad Max: 0.005378
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008691 | Grad Max: 0.008691
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001416 | Grad Max: 0.079536
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026776 | Grad Max: 0.448562
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000360 | Grad Max: 0.008011
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012440 | Grad Max: 0.040228
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000898
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002946 | Grad Max: 0.007362
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000477
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001117 | Grad Max: 0.003257
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002783 | Grad Max: 0.005783
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039245 | Grad Max: 0.039245
[GRADIENT NORM TOTAL] 3.5010

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.139
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51816875 0.48183125] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 1/1615 | B: 136/1912 | C: 220/1828
[LOSS Ex1] A: 0.68184 | B: 0.68232 | C: 0.67998
[LOGITS Ex2 A] Mean Abs: 1.084 | Max: 4.716
[LOSS Ex2] A: 0.35506 | B: 0.42589 | C: 0.42415
** [JOINT LOSS] ** : 1.083076
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001684 | Grad Max: 0.025734
  -> Layer: shared_layers.0.bias | Grad Mean: 0.046365 | Grad Max: 0.235782
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.008927
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000621 | Grad Max: 0.000621
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000408 | Grad Max: 0.026088
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007313 | Grad Max: 0.139165
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.003775
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003145 | Grad Max: 0.016575
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000311
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000739 | Grad Max: 0.002178
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000161
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000287 | Grad Max: 0.001017
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000728 | Grad Max: 0.002491
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010689 | Grad Max: 0.010689
[GRADIENT NORM TOTAL] 0.9601

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.257
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093909  0.49060914] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 2/2046 | B: 142/1714 | C: 154/1222
[LOSS Ex1] A: 0.68221 | B: 0.68416 | C: 0.68010
[LOGITS Ex2 A] Mean Abs: 1.045 | Max: 4.640
[LOSS Ex2] A: 0.37205 | B: 0.42208 | C: 0.43939
** [JOINT LOSS] ** : 1.093327
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004130 | Grad Max: 0.104686
  -> Layer: shared_layers.0.bias | Grad Mean: 0.188129 | Grad Max: 1.115011
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001857 | Grad Max: 0.009461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011913 | Grad Max: 0.011913
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001556 | Grad Max: 0.081064
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029379 | Grad Max: 0.454788
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.010230
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013833 | Grad Max: 0.048470
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000998
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003285 | Grad Max: 0.007861
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000510
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001239 | Grad Max: 0.003345
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003116 | Grad Max: 0.006121
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043952 | Grad Max: 0.043952
[GRADIENT NORM TOTAL] 3.7850

[EPOCH SUMMARY] Train Loss: 1.0590

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0691 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 21/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.247
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.502029   0.49797106] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 1/2047 | B: 144/1904 | C: 243/1805
[LOSS Ex1] A: 0.68195 | B: 0.68380 | C: 0.67818
[LOGITS Ex2 A] Mean Abs: 1.038 | Max: 4.523
[LOSS Ex2] A: 0.36211 | B: 0.46973 | C: 0.42540
** [JOINT LOSS] ** : 1.100387
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004909 | Grad Max: 0.122698
  -> Layer: shared_layers.0.bias | Grad Mean: 0.250350 | Grad Max: 1.480775
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.009974
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009844 | Grad Max: 0.009844
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.114782
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039260 | Grad Max: 0.641793
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000538 | Grad Max: 0.012701
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018596 | Grad Max: 0.068132
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001106
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004396 | Grad Max: 0.009540
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000688
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001671 | Grad Max: 0.004395
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004330 | Grad Max: 0.008374
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060433 | Grad Max: 0.060433
[GRADIENT NORM TOTAL] 5.1026

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.253
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076179  0.49238214] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 5/2043 | B: 151/1897 | C: 219/1829
[LOSS Ex1] A: 0.68106 | B: 0.68410 | C: 0.68166
[LOGITS Ex2 A] Mean Abs: 1.039 | Max: 4.463
[LOSS Ex2] A: 0.37256 | B: 0.44190 | C: 0.42357
** [JOINT LOSS] ** : 1.094952
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.064510
  -> Layer: shared_layers.0.bias | Grad Mean: 0.102820 | Grad Max: 0.599591
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.009989
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016494 | Grad Max: 0.016494
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000838 | Grad Max: 0.043043
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015204 | Grad Max: 0.238438
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.005564
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007035 | Grad Max: 0.028406
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000581
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001662 | Grad Max: 0.003944
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000272
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000626 | Grad Max: 0.001836
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001549 | Grad Max: 0.003420
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022593 | Grad Max: 0.022593
[GRADIENT NORM TOTAL] 2.0688

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.222
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5094007  0.49059927] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 2/2046 | B: 136/1912 | C: 224/1824
[LOSS Ex1] A: 0.68290 | B: 0.68222 | C: 0.68114
[LOGITS Ex2 A] Mean Abs: 1.062 | Max: 4.690
[LOSS Ex2] A: 0.36419 | B: 0.43987 | C: 0.42208
** [JOINT LOSS] ** : 1.090798
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004338 | Grad Max: 0.138156
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226274 | Grad Max: 1.349641
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.008837
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008893 | Grad Max: 0.008893
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001880 | Grad Max: 0.101695
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035758 | Grad Max: 0.579624
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000471 | Grad Max: 0.010058
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016303 | Grad Max: 0.054939
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001145
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003846 | Grad Max: 0.009742
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000530
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001450 | Grad Max: 0.003749
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003482 | Grad Max: 0.006903
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050419 | Grad Max: 0.050419
[GRADIENT NORM TOTAL] 4.6036

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.157
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073779  0.49262208] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 2/2046 | B: 143/1713 | C: 224/1824
[LOSS Ex1] A: 0.68251 | B: 0.68407 | C: 0.67907
[LOGITS Ex2 A] Mean Abs: 1.047 | Max: 4.474
[LOSS Ex2] A: 0.38247 | B: 0.43839 | C: 0.43340
** [JOINT LOSS] ** : 1.099967
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006495 | Grad Max: 0.173904
  -> Layer: shared_layers.0.bias | Grad Mean: 0.271802 | Grad Max: 1.589790
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001787 | Grad Max: 0.008284
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000067 | Grad Max: 0.000067
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002321 | Grad Max: 0.112749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043516 | Grad Max: 0.587782
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.014666
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019766 | Grad Max: 0.072264
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001354
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004679 | Grad Max: 0.011204
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000685
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001756 | Grad Max: 0.004572
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004400 | Grad Max: 0.008448
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061358 | Grad Max: 0.061358
[GRADIENT NORM TOTAL] 5.4867

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.230
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5343116 0.4656884] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 1/2047 | B: 144/1904 | C: 217/1831
[LOSS Ex1] A: 0.68200 | B: 0.68372 | C: 0.68031
[LOGITS Ex2 A] Mean Abs: 1.077 | Max: 4.966
[LOSS Ex2] A: 0.36482 | B: 0.45411 | C: 0.40406
** [JOINT LOSS] ** : 1.089676
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004718 | Grad Max: 0.122908
  -> Layer: shared_layers.0.bias | Grad Mean: 0.192182 | Grad Max: 1.103099
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.010403
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018416 | Grad Max: 0.018416
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001643 | Grad Max: 0.085848
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030778 | Grad Max: 0.487310
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.009196
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014014 | Grad Max: 0.047070
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.001011
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003335 | Grad Max: 0.008298
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000526
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001265 | Grad Max: 0.003365
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003200 | Grad Max: 0.006675
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045394 | Grad Max: 0.045394
[GRADIENT NORM TOTAL] 3.8870

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.259
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50561965 0.49438038] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 3/2045 | B: 151/1897 | C: 236/1812
[LOSS Ex1] A: 0.68345 | B: 0.68402 | C: 0.67923
[LOGITS Ex2 A] Mean Abs: 1.045 | Max: 4.418
[LOSS Ex2] A: 0.36277 | B: 0.44386 | C: 0.41160
** [JOINT LOSS] ** : 1.088309
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003603 | Grad Max: 0.088777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087780 | Grad Max: 0.498775
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001807 | Grad Max: 0.008815
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010823 | Grad Max: 0.010823
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000875 | Grad Max: 0.044543
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015421 | Grad Max: 0.229121
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000197 | Grad Max: 0.006124
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006706 | Grad Max: 0.028189
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000558
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001624 | Grad Max: 0.004386
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000293
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000609 | Grad Max: 0.001927
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001721 | Grad Max: 0.004258
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022238 | Grad Max: 0.022238
[GRADIENT NORM TOTAL] 1.8698

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.099
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53446394 0.4655361 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 0/2048 | B: 137/1911 | C: 240/1808
[LOSS Ex1] A: 0.00000 | B: 0.68213 | C: 0.67969
[LOGITS Ex2 A] Mean Abs: 1.031 | Max: 4.734
[LOSS Ex2] A: 0.35863 | B: 0.42954 | C: 0.41004
** [JOINT LOSS] ** : 0.853341
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004314 | Grad Max: 0.116526
  -> Layer: shared_layers.0.bias | Grad Mean: 0.155855 | Grad Max: 0.946404
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001186 | Grad Max: 0.005385
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009792 | Grad Max: 0.009792
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001453 | Grad Max: 0.104574
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026925 | Grad Max: 0.573081
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000358 | Grad Max: 0.008132
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012292 | Grad Max: 0.041375
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000794
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002936 | Grad Max: 0.006607
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000461
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001108 | Grad Max: 0.003041
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003005 | Grad Max: 0.006481
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039451 | Grad Max: 0.039451
[GRADIENT NORM TOTAL] 3.3038

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.141
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5186696 0.4813304] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 1/1615 | B: 143/1713 | C: 255/1793
[LOSS Ex1] A: 0.68166 | B: 0.68399 | C: 0.67785
[LOGITS Ex2 A] Mean Abs: 1.080 | Max: 4.509
[LOSS Ex2] A: 0.35450 | B: 0.41595 | C: 0.41556
** [JOINT LOSS] ** : 1.076504
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.045507
  -> Layer: shared_layers.0.bias | Grad Mean: 0.063289 | Grad Max: 0.338868
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.009282
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001986 | Grad Max: 0.001986
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000572 | Grad Max: 0.044190
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010367 | Grad Max: 0.256501
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.004031
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004505 | Grad Max: 0.017786
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000377
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001071 | Grad Max: 0.003251
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000214
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000393 | Grad Max: 0.001289
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001018 | Grad Max: 0.003266
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013448 | Grad Max: 0.013448
[GRADIENT NORM TOTAL] 1.3402

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.260
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093554  0.49064457] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 145/1903 | C: 245/1803
[LOSS Ex1] A: 0.68204 | B: 0.68363 | C: 0.67949
[LOGITS Ex2 A] Mean Abs: 1.111 | Max: 4.837
[LOSS Ex2] A: 0.37808 | B: 0.46991 | C: 0.43509
** [JOINT LOSS] ** : 1.109416
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005647 | Grad Max: 0.142185
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268410 | Grad Max: 1.553959
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001970 | Grad Max: 0.010031
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015841 | Grad Max: 0.015841
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.112011
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042393 | Grad Max: 0.635080
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000567 | Grad Max: 0.012715
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019668 | Grad Max: 0.066740
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001290
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004657 | Grad Max: 0.011002
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000696
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001752 | Grad Max: 0.004711
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004450 | Grad Max: 0.008942
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061895 | Grad Max: 0.061895
[GRADIENT NORM TOTAL] 5.4119

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.250
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50189143 0.49810863] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 2/2046 | B: 151/1897 | C: 224/1824
[LOSS Ex1] A: 0.68176 | B: 0.68394 | C: 0.68111
[LOGITS Ex2 A] Mean Abs: 1.132 | Max: 4.481
[LOSS Ex2] A: 0.38579 | B: 0.46657 | C: 0.42391
** [JOINT LOSS] ** : 1.107689
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007203 | Grad Max: 0.194034
  -> Layer: shared_layers.0.bias | Grad Mean: 0.322761 | Grad Max: 1.913378
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001961 | Grad Max: 0.010180
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015147 | Grad Max: 0.015147
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002783 | Grad Max: 0.141313
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052615 | Grad Max: 0.774162
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000701 | Grad Max: 0.015651
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024247 | Grad Max: 0.084011
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000131 | Grad Max: 0.001685
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005733 | Grad Max: 0.013539
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000070 | Grad Max: 0.000845
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002160 | Grad Max: 0.005653
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005504 | Grad Max: 0.011092
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.075711 | Grad Max: 0.075711
[GRADIENT NORM TOTAL] 6.6261

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.256
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075382 0.4924618] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 5/2043 | B: 141/1907 | C: 235/1813
[LOSS Ex1] A: 0.68088 | B: 0.68204 | C: 0.67985
[LOGITS Ex2 A] Mean Abs: 1.098 | Max: 4.551
[LOSS Ex2] A: 0.38065 | B: 0.43560 | C: 0.41679
** [JOINT LOSS] ** : 1.091937
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005576 | Grad Max: 0.156752
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198938 | Grad Max: 1.129021
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.010182
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011534 | Grad Max: 0.011534
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001778 | Grad Max: 0.084663
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033023 | Grad Max: 0.479800
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000436 | Grad Max: 0.011860
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014943 | Grad Max: 0.058129
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.001090
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003547 | Grad Max: 0.008816
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000531
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001340 | Grad Max: 0.003504
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003464 | Grad Max: 0.006762
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047394 | Grad Max: 0.047394
[GRADIENT NORM TOTAL] 4.1132

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.224
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093507 0.4906493] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 3/2045 | B: 144/1712 | C: 251/1797
[LOSS Ex1] A: 0.68275 | B: 0.68391 | C: 0.67960
[LOGITS Ex2 A] Mean Abs: 1.028 | Max: 4.572
[LOSS Ex2] A: 0.35330 | B: 0.41448 | C: 0.40399
** [JOINT LOSS] ** : 1.072673
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002556 | Grad Max: 0.071805
  -> Layer: shared_layers.0.bias | Grad Mean: 0.078239 | Grad Max: 0.384307
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001792 | Grad Max: 0.008620
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009703 | Grad Max: 0.009703
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000706 | Grad Max: 0.038589
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012772 | Grad Max: 0.215937
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000163 | Grad Max: 0.004428
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005513 | Grad Max: 0.020549
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000482
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001311 | Grad Max: 0.003756
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000262
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000486 | Grad Max: 0.001619
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001407 | Grad Max: 0.004608
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017262 | Grad Max: 0.017262
[GRADIENT NORM TOTAL] 1.5786

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.159
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076469  0.49235314] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 3/2045 | B: 148/1900 | C: 252/1796
[LOSS Ex1] A: 0.68235 | B: 0.68355 | C: 0.67796
[LOGITS Ex2 A] Mean Abs: 1.003 | Max: 4.603
[LOSS Ex2] A: 0.35678 | B: 0.45420 | C: 0.41041
** [JOINT LOSS] ** : 1.088418
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003448 | Grad Max: 0.095965
  -> Layer: shared_layers.0.bias | Grad Mean: 0.187842 | Grad Max: 1.110170
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.008457
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000863 | Grad Max: 0.000863
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001537 | Grad Max: 0.106326
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029292 | Grad Max: 0.608191
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.010031
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013488 | Grad Max: 0.055453
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000905
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003173 | Grad Max: 0.007687
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000466
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001195 | Grad Max: 0.003239
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003099 | Grad Max: 0.006405
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042433 | Grad Max: 0.042433
[GRADIENT NORM TOTAL] 3.8708

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.232
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53506166 0.46493837] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 156/1892 | C: 148/1228
[LOSS Ex1] A: 0.68182 | B: 0.68386 | C: 0.67994
[LOGITS Ex2 A] Mean Abs: 1.047 | Max: 4.590
[LOSS Ex2] A: 0.33481 | B: 0.43873 | C: 0.41616
** [JOINT LOSS] ** : 1.078439
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001768 | Grad Max: 0.046445
  -> Layer: shared_layers.0.bias | Grad Mean: 0.076378 | Grad Max: 0.416662
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.010232
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015725 | Grad Max: 0.015725
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000625 | Grad Max: 0.043979
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011122 | Grad Max: 0.249136
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.004188
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005020 | Grad Max: 0.021502
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000452
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001179 | Grad Max: 0.003329
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000203
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000440 | Grad Max: 0.001370
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001187 | Grad Max: 0.003772
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015630 | Grad Max: 0.015630
[GRADIENT NORM TOTAL] 1.5029

[EPOCH SUMMARY] Train Loss: 1.0745

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0641 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0679 -> New: 1.0641)

############################## EPOCH 22/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.262
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5055422  0.49445778] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 3/2045 | B: 148/1900 | C: 231/1817
[LOSS Ex1] A: 0.68330 | B: 0.68195 | C: 0.67994
[LOGITS Ex2 A] Mean Abs: 1.088 | Max: 4.311
[LOSS Ex2] A: 0.34913 | B: 0.42867 | C: 0.42333
** [JOINT LOSS] ** : 1.082106
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004530 | Grad Max: 0.120826
  -> Layer: shared_layers.0.bias | Grad Mean: 0.223453 | Grad Max: 1.330951
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.009348
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014347 | Grad Max: 0.014347
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001856 | Grad Max: 0.112556
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035517 | Grad Max: 0.635956
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.010714
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016419 | Grad Max: 0.058066
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001070
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003871 | Grad Max: 0.008693
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000615
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001460 | Grad Max: 0.003974
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003580 | Grad Max: 0.006972
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051331 | Grad Max: 0.051331
[GRADIENT NORM TOTAL] 4.5770

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53511965 0.46488038] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 0/2048 | B: 153/1703 | C: 248/1800
[LOSS Ex1] A: 0.00000 | B: 0.68382 | C: 0.68020
[LOGITS Ex2 A] Mean Abs: 1.100 | Max: 4.449
[LOSS Ex2] A: 0.37156 | B: 0.42652 | C: 0.41810
** [JOINT LOSS] ** : 0.860067
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006180 | Grad Max: 0.150771
  -> Layer: shared_layers.0.bias | Grad Mean: 0.266865 | Grad Max: 1.610612
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001275 | Grad Max: 0.005157
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018048 | Grad Max: 0.018048
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002277 | Grad Max: 0.115358
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043128 | Grad Max: 0.642819
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000569 | Grad Max: 0.013880
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019719 | Grad Max: 0.071334
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001283
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004653 | Grad Max: 0.010626
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000683
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001755 | Grad Max: 0.004712
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004533 | Grad Max: 0.008591
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062595 | Grad Max: 0.062595
[GRADIENT NORM TOTAL] 5.4504

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.143
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51918674 0.48081324] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.010
[MASKS] A(Pass/Fail): 2/1614 | B: 153/1895 | C: 244/1804
[LOSS Ex1] A: 0.68149 | B: 0.68347 | C: 0.67732
[LOGITS Ex2 A] Mean Abs: 1.115 | Max: 4.934
[LOSS Ex2] A: 0.32734 | B: 0.45755 | C: 0.41069
** [JOINT LOSS] ** : 1.079283
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.055401
  -> Layer: shared_layers.0.bias | Grad Mean: 0.092571 | Grad Max: 0.550082
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.009032
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001047 | Grad Max: 0.001047
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000782 | Grad Max: 0.076852
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014909 | Grad Max: 0.434616
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000190 | Grad Max: 0.005944
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006553 | Grad Max: 0.027953
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000500
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001528 | Grad Max: 0.003864
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000280
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000573 | Grad Max: 0.001928
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003557
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020145 | Grad Max: 0.020145
[GRADIENT NORM TOTAL] 1.9485

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.263
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093078 0.4906922] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 163/1885 | C: 244/1804
[LOSS Ex1] A: 0.68186 | B: 0.68377 | C: 0.67885
[LOGITS Ex2 A] Mean Abs: 1.066 | Max: 4.653
[LOSS Ex2] A: 0.35689 | B: 0.44502 | C: 0.40143
** [JOINT LOSS] ** : 1.082612
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003889 | Grad Max: 0.097828
  -> Layer: shared_layers.0.bias | Grad Mean: 0.183383 | Grad Max: 1.105089
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001966 | Grad Max: 0.010127
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014868 | Grad Max: 0.014868
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001527 | Grad Max: 0.083980
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028722 | Grad Max: 0.471678
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000382 | Grad Max: 0.007888
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013300 | Grad Max: 0.043242
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000894
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003152 | Grad Max: 0.007374
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000456
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001180 | Grad Max: 0.003214
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003013 | Grad Max: 0.005795
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041744 | Grad Max: 0.041744
[GRADIENT NORM TOTAL] 3.7291

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.253
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50175244 0.49824756] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 149/1899 | C: 227/1821
[LOSS Ex1] A: 0.68156 | B: 0.68186 | C: 0.68034
[LOGITS Ex2 A] Mean Abs: 1.056 | Max: 4.537
[LOSS Ex2] A: 0.35316 | B: 0.43409 | C: 0.45169
** [JOINT LOSS] ** : 1.094234
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006137 | Grad Max: 0.147072
  -> Layer: shared_layers.0.bias | Grad Mean: 0.263439 | Grad Max: 1.541981
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001996 | Grad Max: 0.009893
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012832 | Grad Max: 0.012832
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002247 | Grad Max: 0.112763
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042291 | Grad Max: 0.624063
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000565 | Grad Max: 0.013859
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019631 | Grad Max: 0.070626
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001298
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004653 | Grad Max: 0.010898
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000743
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001756 | Grad Max: 0.004849
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004597 | Grad Max: 0.008323
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062975 | Grad Max: 0.062975
[GRADIENT NORM TOTAL] 5.3535

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.259
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50742227 0.49257767] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 6/2042 | B: 154/1702 | C: 246/1802
[LOSS Ex1] A: 0.68068 | B: 0.68374 | C: 0.68020
[LOGITS Ex2 A] Mean Abs: 1.059 | Max: 4.535
[LOSS Ex2] A: 0.35725 | B: 0.41507 | C: 0.42078
** [JOINT LOSS] ** : 1.079245
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002852 | Grad Max: 0.072068
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141598 | Grad Max: 0.831983
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001946 | Grad Max: 0.009864
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013707 | Grad Max: 0.013707
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001149 | Grad Max: 0.060755
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021966 | Grad Max: 0.352317
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000293 | Grad Max: 0.007297
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010265 | Grad Max: 0.037308
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000703
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002423 | Grad Max: 0.005888
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000399
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000908 | Grad Max: 0.002638
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002284 | Grad Max: 0.004926
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031776 | Grad Max: 0.031776
[GRADIENT NORM TOTAL] 2.8667

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.226
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5092548  0.49074516] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 3/2045 | B: 153/1895 | C: 222/1826
[LOSS Ex1] A: 0.68260 | B: 0.68338 | C: 0.68025
[LOGITS Ex2 A] Mean Abs: 1.078 | Max: 4.656
[LOSS Ex2] A: 0.35663 | B: 0.46104 | C: 0.41937
** [JOINT LOSS] ** : 1.094421
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002446 | Grad Max: 0.051743
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134230 | Grad Max: 0.753089
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001730 | Grad Max: 0.008105
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003588 | Grad Max: 0.003588
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001067 | Grad Max: 0.063749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020029 | Grad Max: 0.347898
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000267 | Grad Max: 0.006850
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009334 | Grad Max: 0.034395
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000718
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002189 | Grad Max: 0.005985
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000331
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000816 | Grad Max: 0.002289
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001906 | Grad Max: 0.004360
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028088 | Grad Max: 0.028088
[GRADIENT NORM TOTAL] 2.6999

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.160
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50794715 0.49205288] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.512 | Std: 0.010
[MASKS] A(Pass/Fail): 4/2044 | B: 165/1883 | C: 233/1815
[LOSS Ex1] A: 0.68220 | B: 0.68369 | C: 0.67952
[LOGITS Ex2 A] Mean Abs: 1.071 | Max: 4.442
[LOSS Ex2] A: 0.36048 | B: 0.45762 | C: 0.41785
** [JOINT LOSS] ** : 1.093787
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004323 | Grad Max: 0.108526
  -> Layer: shared_layers.0.bias | Grad Mean: 0.216808 | Grad Max: 1.245048
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001855 | Grad Max: 0.009215
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010434 | Grad Max: 0.010434
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001775 | Grad Max: 0.103796
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033606 | Grad Max: 0.579509
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.010943
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015688 | Grad Max: 0.056864
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001128
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003710 | Grad Max: 0.008722
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000531
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001390 | Grad Max: 0.003663
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003401 | Grad Max: 0.006679
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048490 | Grad Max: 0.048490
[GRADIENT NORM TOTAL] 4.3841

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.235
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53586906 0.46413097] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 3/2045 | B: 150/1898 | C: 235/1813
[LOSS Ex1] A: 0.68163 | B: 0.68178 | C: 0.67905
[LOGITS Ex2 A] Mean Abs: 1.090 | Max: 4.642
[LOSS Ex2] A: 0.35271 | B: 0.43174 | C: 0.40936
** [JOINT LOSS] ** : 1.078758
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002477 | Grad Max: 0.066457
  -> Layer: shared_layers.0.bias | Grad Mean: 0.096364 | Grad Max: 0.555884
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.009716
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010576 | Grad Max: 0.010576
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000797 | Grad Max: 0.047287
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014807 | Grad Max: 0.261720
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000192 | Grad Max: 0.004690
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006641 | Grad Max: 0.024433
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000478
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001560 | Grad Max: 0.004153
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000595 | Grad Max: 0.001831
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001558 | Grad Max: 0.003997
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021985 | Grad Max: 0.021985
[GRADIENT NORM TOTAL] 1.9382

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.265
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.505439   0.49456105] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 3/2045 | B: 156/1700 | C: 223/1825
[LOSS Ex1] A: 0.68314 | B: 0.68366 | C: 0.67967
[LOGITS Ex2 A] Mean Abs: 1.058 | Max: 4.719
[LOSS Ex2] A: 0.35177 | B: 0.41659 | C: 0.41951
** [JOINT LOSS] ** : 1.078117
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004553 | Grad Max: 0.128088
  -> Layer: shared_layers.0.bias | Grad Mean: 0.158670 | Grad Max: 0.885726
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.009559
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016716 | Grad Max: 0.016716
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001392 | Grad Max: 0.071994
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025273 | Grad Max: 0.400589
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.006956
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011331 | Grad Max: 0.036252
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000842
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002713 | Grad Max: 0.006218
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000411
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001018 | Grad Max: 0.002806
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002780 | Grad Max: 0.005550
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036515 | Grad Max: 0.036515
[GRADIENT NORM TOTAL] 3.2278

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.102
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5358309  0.46416909] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 0/2048 | B: 154/1894 | C: 255/1793
[LOSS Ex1] A: 0.00000 | B: 0.68330 | C: 0.67855
[LOGITS Ex2 A] Mean Abs: 1.055 | Max: 4.855
[LOSS Ex2] A: 0.35852 | B: 0.45845 | C: 0.41381
** [JOINT LOSS] ** : 0.864212
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005524 | Grad Max: 0.147295
  -> Layer: shared_layers.0.bias | Grad Mean: 0.261749 | Grad Max: 1.524750
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001123 | Grad Max: 0.005215
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007476 | Grad Max: 0.007476
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.133882
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040615 | Grad Max: 0.730334
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.011662
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018673 | Grad Max: 0.065062
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000099 | Grad Max: 0.001192
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004418 | Grad Max: 0.010157
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000627
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001660 | Grad Max: 0.004375
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004291 | Grad Max: 0.008299
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058848 | Grad Max: 0.058848
[GRADIENT NORM TOTAL] 5.2698

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.144
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5196843 0.4803157] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/1614 | B: 168/1880 | C: 205/1843
[LOSS Ex1] A: 0.68132 | B: 0.68362 | C: 0.68106
[LOGITS Ex2 A] Mean Abs: 1.089 | Max: 4.801
[LOSS Ex2] A: 0.34803 | B: 0.44282 | C: 0.41445
** [JOINT LOSS] ** : 1.083769
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003406 | Grad Max: 0.089123
  -> Layer: shared_layers.0.bias | Grad Mean: 0.145226 | Grad Max: 0.828051
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001836 | Grad Max: 0.009263
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008031 | Grad Max: 0.008031
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001191 | Grad Max: 0.063661
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022414 | Grad Max: 0.370680
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000304 | Grad Max: 0.008073
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010478 | Grad Max: 0.038783
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000702
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002469 | Grad Max: 0.006054
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000377
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000919 | Grad Max: 0.002503
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002452 | Grad Max: 0.005520
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032830 | Grad Max: 0.032830
[GRADIENT NORM TOTAL] 2.8901

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.266
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.509238 0.490762] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 151/1897 | C: 249/1799
[LOSS Ex1] A: 0.68170 | B: 0.68169 | C: 0.67778
[LOGITS Ex2 A] Mean Abs: 1.122 | Max: 6.306
[LOSS Ex2] A: 0.35723 | B: 0.42676 | C: 0.40834
** [JOINT LOSS] ** : 1.077832
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004634 | Grad Max: 0.100497
  -> Layer: shared_layers.0.bias | Grad Mean: 0.173893 | Grad Max: 0.989522
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.008983
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001827 | Grad Max: 0.001827
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001492 | Grad Max: 0.074146
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027966 | Grad Max: 0.404381
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000363 | Grad Max: 0.009133
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012626 | Grad Max: 0.046685
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000821
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002987 | Grad Max: 0.007175
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000425
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001118 | Grad Max: 0.002898
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002849 | Grad Max: 0.006157
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039589 | Grad Max: 0.039589
[GRADIENT NORM TOTAL] 3.5310

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.255
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015807 0.4984193] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 156/1700 | C: 171/1205
[LOSS Ex1] A: 0.68139 | B: 0.68358 | C: 0.67757
[LOGITS Ex2 A] Mean Abs: 1.135 | Max: 4.289
[LOSS Ex2] A: 0.36016 | B: 0.43713 | C: 0.38240
** [JOINT LOSS] ** : 1.074075
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006461 | Grad Max: 0.159487
  -> Layer: shared_layers.0.bias | Grad Mean: 0.270108 | Grad Max: 1.588480
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002032 | Grad Max: 0.010222
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015090 | Grad Max: 0.015090
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.109852
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042925 | Grad Max: 0.601425
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000564 | Grad Max: 0.013670
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019655 | Grad Max: 0.071287
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001325
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004634 | Grad Max: 0.011158
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000678
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001741 | Grad Max: 0.004725
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004467 | Grad Max: 0.008718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062003 | Grad Max: 0.062003
[GRADIENT NORM TOTAL] 5.4593

[EPOCH SUMMARY] Train Loss: 1.0516

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0572 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0641 -> New: 1.0572)

############################## EPOCH 23/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.261
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50730497 0.492695  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 6/2042 | B: 156/1892 | C: 267/1781
[LOSS Ex1] A: 0.68050 | B: 0.68322 | C: 0.67606
[LOGITS Ex2 A] Mean Abs: 1.109 | Max: 5.238
[LOSS Ex2] A: 0.36415 | B: 0.45150 | C: 0.43027
** [JOINT LOSS] ** : 1.095234
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005123 | Grad Max: 0.134992
  -> Layer: shared_layers.0.bias | Grad Mean: 0.161182 | Grad Max: 0.906454
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.009538
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004126 | Grad Max: 0.004126
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001396 | Grad Max: 0.067089
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026093 | Grad Max: 0.335241
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000343 | Grad Max: 0.009482
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011749 | Grad Max: 0.048084
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000801
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002778 | Grad Max: 0.006571
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000427
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001030 | Grad Max: 0.002796
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002592 | Grad Max: 0.005250
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035586 | Grad Max: 0.035586
[GRADIENT NORM TOTAL] 3.2385

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.228
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50918573 0.49081424] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 3/2045 | B: 168/1880 | C: 223/1825
[LOSS Ex1] A: 0.68246 | B: 0.68354 | C: 0.68091
[LOGITS Ex2 A] Mean Abs: 1.040 | Max: 5.031
[LOSS Ex2] A: 0.35954 | B: 0.44352 | C: 0.42824
** [JOINT LOSS] ** : 1.092738
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003122 | Grad Max: 0.075464
  -> Layer: shared_layers.0.bias | Grad Mean: 0.142882 | Grad Max: 0.873323
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001773 | Grad Max: 0.008816
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009772 | Grad Max: 0.009772
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001217 | Grad Max: 0.092670
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022880 | Grad Max: 0.529027
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000299 | Grad Max: 0.008136
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010440 | Grad Max: 0.040381
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000713
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002451 | Grad Max: 0.006162
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000382
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000920 | Grad Max: 0.002681
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002457 | Grad Max: 0.005330
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032940 | Grad Max: 0.032940
[GRADIENT NORM TOTAL] 3.0035

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.161
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082023  0.49179772] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 5/2043 | B: 152/1896 | C: 244/1804
[LOSS Ex1] A: 0.68206 | B: 0.68161 | C: 0.67851
[LOGITS Ex2 A] Mean Abs: 1.018 | Max: 4.800
[LOSS Ex2] A: 0.35070 | B: 0.42319 | C: 0.39795
** [JOINT LOSS] ** : 1.071342
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004931 | Grad Max: 0.127516
  -> Layer: shared_layers.0.bias | Grad Mean: 0.234615 | Grad Max: 1.315274
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001881 | Grad Max: 0.008601
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002700 | Grad Max: 0.002700
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.104016
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036381 | Grad Max: 0.575461
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000484 | Grad Max: 0.011028
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016938 | Grad Max: 0.058081
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001076
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003981 | Grad Max: 0.009098
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000548
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001484 | Grad Max: 0.003854
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003793 | Grad Max: 0.007835
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051802 | Grad Max: 0.051802
[GRADIENT NORM TOTAL] 4.6856

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.237
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53661335 0.46338665] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 3/2045 | B: 157/1699 | C: 261/1787
[LOSS Ex1] A: 0.68146 | B: 0.68351 | C: 0.67818
[LOGITS Ex2 A] Mean Abs: 1.073 | Max: 4.766
[LOSS Ex2] A: 0.32734 | B: 0.40819 | C: 0.40441
** [JOINT LOSS] ** : 1.061030
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.060042
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127312 | Grad Max: 0.687334
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001931 | Grad Max: 0.009782
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011640 | Grad Max: 0.011640
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001028 | Grad Max: 0.054621
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019191 | Grad Max: 0.302755
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.006130
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009034 | Grad Max: 0.032004
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000671
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002131 | Grad Max: 0.005466
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000320
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000787 | Grad Max: 0.002155
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001974 | Grad Max: 0.005511
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026874 | Grad Max: 0.026874
[GRADIENT NORM TOTAL] 2.5333

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.267
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50536186 0.49463814] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 3/2045 | B: 157/1891 | C: 229/1819
[LOSS Ex1] A: 0.68300 | B: 0.68315 | C: 0.68010
[LOGITS Ex2 A] Mean Abs: 1.109 | Max: 4.545
[LOSS Ex2] A: 0.34968 | B: 0.45285 | C: 0.40700
** [JOINT LOSS] ** : 1.085255
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003208 | Grad Max: 0.080885
  -> Layer: shared_layers.0.bias | Grad Mean: 0.175309 | Grad Max: 1.052779
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001812 | Grad Max: 0.008680
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010630 | Grad Max: 0.010630
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001396 | Grad Max: 0.082477
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026940 | Grad Max: 0.450518
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.010607
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012598 | Grad Max: 0.048764
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000853
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002931 | Grad Max: 0.007241
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000430
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001093 | Grad Max: 0.003025
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002648 | Grad Max: 0.005377
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037911 | Grad Max: 0.037911
[GRADIENT NORM TOTAL] 3.5679

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5364586 0.4635414] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 169/1879 | C: 214/1834
[LOSS Ex1] A: 0.68193 | B: 0.68347 | C: 0.68147
[LOGITS Ex2 A] Mean Abs: 1.129 | Max: 4.474
[LOSS Ex2] A: 0.36081 | B: 0.45058 | C: 0.40625
** [JOINT LOSS] ** : 1.088163
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006110 | Grad Max: 0.144825
  -> Layer: shared_layers.0.bias | Grad Mean: 0.276859 | Grad Max: 1.653459
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.009412
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012852 | Grad Max: 0.012852
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002325 | Grad Max: 0.118838
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043801 | Grad Max: 0.663423
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000566 | Grad Max: 0.013760
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019751 | Grad Max: 0.068547
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001432
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004644 | Grad Max: 0.011519
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000666
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001732 | Grad Max: 0.004435
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004330 | Grad Max: 0.008344
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060178 | Grad Max: 0.060178
[GRADIENT NORM TOTAL] 5.6519

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.146
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52012134 0.47987872] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/1614 | B: 152/1896 | C: 263/1785
[LOSS Ex1] A: 0.68116 | B: 0.68153 | C: 0.67714
[LOGITS Ex2 A] Mean Abs: 1.146 | Max: 4.559
[LOSS Ex2] A: 0.33818 | B: 0.42638 | C: 0.40965
** [JOINT LOSS] ** : 1.071349
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002416 | Grad Max: 0.059987
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127263 | Grad Max: 0.740647
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.008958
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000095 | Grad Max: 0.000095
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001020 | Grad Max: 0.081629
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019382 | Grad Max: 0.463103
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.006045
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008991 | Grad Max: 0.031007
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000572
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002098 | Grad Max: 0.004899
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000313
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000787 | Grad Max: 0.002281
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001785 | Grad Max: 0.003977
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027237 | Grad Max: 0.027237
[GRADIENT NORM TOTAL] 2.6094

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.268
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5092207 0.4907793] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 159/1697 | C: 249/1799
[LOSS Ex1] A: 0.68153 | B: 0.68344 | C: 0.67904
[LOGITS Ex2 A] Mean Abs: 1.097 | Max: 5.145
[LOSS Ex2] A: 0.34634 | B: 0.42282 | C: 0.38381
** [JOINT LOSS] ** : 1.065659
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003504 | Grad Max: 0.088789
  -> Layer: shared_layers.0.bias | Grad Mean: 0.124085 | Grad Max: 0.720976
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.010128
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018072 | Grad Max: 0.018072
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001097 | Grad Max: 0.053525
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020447 | Grad Max: 0.295581
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000266 | Grad Max: 0.006112
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009235 | Grad Max: 0.032819
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000606
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002174 | Grad Max: 0.005052
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000334
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000800 | Grad Max: 0.002313
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002096 | Grad Max: 0.004965
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027565 | Grad Max: 0.027565
[GRADIENT NORM TOTAL] 2.5361

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.257
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50149214 0.4985078 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 158/1890 | C: 230/1818
[LOSS Ex1] A: 0.68120 | B: 0.68308 | C: 0.67914
[LOGITS Ex2 A] Mean Abs: 1.081 | Max: 5.476
[LOSS Ex2] A: 0.34929 | B: 0.46600 | C: 0.42607
** [JOINT LOSS] ** : 1.094927
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004754 | Grad Max: 0.104565
  -> Layer: shared_layers.0.bias | Grad Mean: 0.202685 | Grad Max: 1.161020
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.010375
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015991 | Grad Max: 0.015991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001711 | Grad Max: 0.101170
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032033 | Grad Max: 0.559006
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000412 | Grad Max: 0.010159
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014440 | Grad Max: 0.051688
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.001023
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003432 | Grad Max: 0.008354
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000524
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001291 | Grad Max: 0.003547
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003360 | Grad Max: 0.006942
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045876 | Grad Max: 0.045876
[GRADIENT NORM TOTAL] 4.0878

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.263
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50723255 0.49276745] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 6/2042 | B: 170/1878 | C: 228/1820
[LOSS Ex1] A: 0.68032 | B: 0.68340 | C: 0.67909
[LOGITS Ex2 A] Mean Abs: 1.082 | Max: 4.687
[LOSS Ex2] A: 0.34881 | B: 0.43449 | C: 0.41455
** [JOINT LOSS] ** : 1.080215
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.055829
  -> Layer: shared_layers.0.bias | Grad Mean: 0.074723 | Grad Max: 0.483982
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.009645
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008926 | Grad Max: 0.008926
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000672 | Grad Max: 0.048698
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011689 | Grad Max: 0.267615
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.004006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005018 | Grad Max: 0.019550
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000391
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001172 | Grad Max: 0.003126
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000205
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000435 | Grad Max: 0.001231
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001112 | Grad Max: 0.003244
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015744 | Grad Max: 0.015744
[GRADIENT NORM TOTAL] 1.5693

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.229
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50913477 0.49086523] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 3/2045 | B: 154/1894 | C: 228/1820
[LOSS Ex1] A: 0.68231 | B: 0.68145 | C: 0.67834
[LOGITS Ex2 A] Mean Abs: 1.101 | Max: 4.554
[LOSS Ex2] A: 0.34866 | B: 0.43836 | C: 0.40435
** [JOINT LOSS] ** : 1.077823
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004630 | Grad Max: 0.116870
  -> Layer: shared_layers.0.bias | Grad Mean: 0.205758 | Grad Max: 1.181603
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.008241
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001311 | Grad Max: 0.001311
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001696 | Grad Max: 0.093407
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032136 | Grad Max: 0.522941
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000424 | Grad Max: 0.010538
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014777 | Grad Max: 0.053937
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000918
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003453 | Grad Max: 0.007942
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000535
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001286 | Grad Max: 0.003661
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003147 | Grad Max: 0.006693
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044823 | Grad Max: 0.044823
[GRADIENT NORM TOTAL] 4.1581

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.162
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084509  0.49154907] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 5/2043 | B: 159/1697 | C: 236/1812
[LOSS Ex1] A: 0.68190 | B: 0.68336 | C: 0.67997
[LOGITS Ex2 A] Mean Abs: 1.104 | Max: 4.542
[LOSS Ex2] A: 0.36188 | B: 0.43899 | C: 0.42175
** [JOINT LOSS] ** : 1.089285
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005803 | Grad Max: 0.134861
  -> Layer: shared_layers.0.bias | Grad Mean: 0.270687 | Grad Max: 1.551564
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001755 | Grad Max: 0.008655
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005047 | Grad Max: 0.005047
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.114568
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041495 | Grad Max: 0.643473
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000546 | Grad Max: 0.012120
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019141 | Grad Max: 0.064964
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001297
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004485 | Grad Max: 0.010512
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000646
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001669 | Grad Max: 0.004399
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004182 | Grad Max: 0.008199
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058227 | Grad Max: 0.058227
[GRADIENT NORM TOTAL] 5.3684

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.240
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5374016  0.46259835] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 4/2044 | B: 159/1889 | C: 209/1839
[LOSS Ex1] A: 0.68127 | B: 0.68300 | C: 0.67999
[LOGITS Ex2 A] Mean Abs: 1.118 | Max: 4.854
[LOSS Ex2] A: 0.34661 | B: 0.45333 | C: 0.42443
** [JOINT LOSS] ** : 1.089543
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003652 | Grad Max: 0.099446
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160824 | Grad Max: 0.976620
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.010062
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018914 | Grad Max: 0.018914
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001342 | Grad Max: 0.071363
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025257 | Grad Max: 0.396722
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000336 | Grad Max: 0.007776
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011813 | Grad Max: 0.040404
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000770
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002778 | Grad Max: 0.006808
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000394
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001030 | Grad Max: 0.002741
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002474 | Grad Max: 0.004747
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035247 | Grad Max: 0.035247
[GRADIENT NORM TOTAL] 3.3180

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.270
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052959 0.4947041] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 3/2045 | B: 171/1877 | C: 165/1211
[LOSS Ex1] A: 0.68283 | B: 0.68332 | C: 0.67916
[LOGITS Ex2 A] Mean Abs: 1.088 | Max: 4.635
[LOSS Ex2] A: 0.34082 | B: 0.43442 | C: 0.42400
** [JOINT LOSS] ** : 1.081518
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005606 | Grad Max: 0.159080
  -> Layer: shared_layers.0.bias | Grad Mean: 0.142657 | Grad Max: 0.736171
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001917 | Grad Max: 0.009620
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017887 | Grad Max: 0.017887
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001273 | Grad Max: 0.063002
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022689 | Grad Max: 0.329573
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.007124
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009630 | Grad Max: 0.035300
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000780
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002320 | Grad Max: 0.005474
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000366
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000859 | Grad Max: 0.002322
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002252 | Grad Max: 0.004933
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029398 | Grad Max: 0.029398
[GRADIENT NORM TOTAL] 2.8291

[EPOCH SUMMARY] Train Loss: 1.0817

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0599 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 24/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.105
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53719544 0.4628045 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 3/2045 | B: 156/1892 | C: 229/1819
[LOSS Ex1] A: 0.68175 | B: 0.68136 | C: 0.67941
[LOGITS Ex2 A] Mean Abs: 1.084 | Max: 4.936
[LOSS Ex2] A: 0.36269 | B: 0.42657 | C: 0.41507
** [JOINT LOSS] ** : 1.082281
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005779 | Grad Max: 0.156171
  -> Layer: shared_layers.0.bias | Grad Mean: 0.235898 | Grad Max: 1.361046
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001896 | Grad Max: 0.008961
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005831 | Grad Max: 0.005831
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002016 | Grad Max: 0.111760
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037954 | Grad Max: 0.631322
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000497 | Grad Max: 0.011450
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017344 | Grad Max: 0.060775
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001154
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004106 | Grad Max: 0.009510
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000599
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001530 | Grad Max: 0.004175
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004014 | Grad Max: 0.007576
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.054429 | Grad Max: 0.054429
[GRADIENT NORM TOTAL] 4.7675

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.148
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5206508  0.47934923] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/1614 | B: 160/1696 | C: 223/1825
[LOSS Ex1] A: 0.68098 | B: 0.68329 | C: 0.68014
[LOGITS Ex2 A] Mean Abs: 1.104 | Max: 4.838
[LOSS Ex2] A: 0.34186 | B: 0.42207 | C: 0.40549
** [JOINT LOSS] ** : 1.071272
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003545 | Grad Max: 0.088657
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143906 | Grad Max: 0.787755
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001871 | Grad Max: 0.009759
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009751 | Grad Max: 0.009751
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001177 | Grad Max: 0.091146
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021577 | Grad Max: 0.498778
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000281 | Grad Max: 0.007044
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009806 | Grad Max: 0.037234
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000822
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002332 | Grad Max: 0.006447
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000319
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000859 | Grad Max: 0.002217
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002254 | Grad Max: 0.005331
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030139 | Grad Max: 0.030139
[GRADIENT NORM TOTAL] 2.8856

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.271
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50915897 0.49084103] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 2/2046 | B: 160/1888 | C: 242/1806
[LOSS Ex1] A: 0.68134 | B: 0.68292 | C: 0.67935
[LOGITS Ex2 A] Mean Abs: 1.131 | Max: 6.331
[LOSS Ex2] A: 0.34749 | B: 0.45213 | C: 0.40974
** [JOINT LOSS] ** : 1.084322
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003417 | Grad Max: 0.075436
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125710 | Grad Max: 0.759782
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001874 | Grad Max: 0.009211
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008991 | Grad Max: 0.008991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001075 | Grad Max: 0.081204
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019848 | Grad Max: 0.453262
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.007646
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008755 | Grad Max: 0.040218
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000822
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002061 | Grad Max: 0.006558
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000358
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000766 | Grad Max: 0.002317
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001890 | Grad Max: 0.004055
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026532 | Grad Max: 0.026532
[GRADIENT NORM TOTAL] 2.5679

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.260
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50131977 0.49868017] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 2/2046 | B: 172/1876 | C: 246/1802
[LOSS Ex1] A: 0.68101 | B: 0.68324 | C: 0.67880
[LOGITS Ex2 A] Mean Abs: 1.141 | Max: 4.606
[LOSS Ex2] A: 0.35623 | B: 0.44654 | C: 0.40406
** [JOINT LOSS] ** : 1.083287
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005331 | Grad Max: 0.138473
  -> Layer: shared_layers.0.bias | Grad Mean: 0.234160 | Grad Max: 1.371714
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.010696
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019404 | Grad Max: 0.019404
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.113130
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036972 | Grad Max: 0.610887
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.012103
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016765 | Grad Max: 0.061467
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.001060
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003927 | Grad Max: 0.009582
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000537
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001461 | Grad Max: 0.003896
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003589 | Grad Max: 0.007127
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050584 | Grad Max: 0.050584
[GRADIENT NORM TOTAL] 4.7647

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.266
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070989  0.49290103] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 7/2041 | B: 157/1891 | C: 239/1809
[LOSS Ex1] A: 0.68012 | B: 0.68128 | C: 0.67887
[LOGITS Ex2 A] Mean Abs: 1.121 | Max: 5.463
[LOSS Ex2] A: 0.34508 | B: 0.42127 | C: 0.41714
** [JOINT LOSS] ** : 1.074588
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004324 | Grad Max: 0.124000
  -> Layer: shared_layers.0.bias | Grad Mean: 0.110084 | Grad Max: 0.600424
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.009860
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008881 | Grad Max: 0.008881
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001014 | Grad Max: 0.051194
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018416 | Grad Max: 0.255523
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.006045
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007798 | Grad Max: 0.032367
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000542
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001858 | Grad Max: 0.004671
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000318
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.002020
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001725 | Grad Max: 0.004311
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024042 | Grad Max: 0.024042
[GRADIENT NORM TOTAL] 2.2659

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.231
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090434 0.4909566] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 4/2044 | B: 160/1696 | C: 220/1828
[LOSS Ex1] A: 0.68215 | B: 0.68321 | C: 0.68025
[LOGITS Ex2 A] Mean Abs: 1.065 | Max: 4.595
[LOSS Ex2] A: 0.33837 | B: 0.42728 | C: 0.42476
** [JOINT LOSS] ** : 1.078676
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003368 | Grad Max: 0.095551
  -> Layer: shared_layers.0.bias | Grad Mean: 0.195502 | Grad Max: 1.123464
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001793 | Grad Max: 0.008783
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010798 | Grad Max: 0.010798
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001609 | Grad Max: 0.093884
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030371 | Grad Max: 0.542695
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.009561
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014124 | Grad Max: 0.048398
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000877
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003310 | Grad Max: 0.007623
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000448
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001230 | Grad Max: 0.003221
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003149 | Grad Max: 0.006619
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043132 | Grad Max: 0.043132
[GRADIENT NORM TOTAL] 4.0290

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.163
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50874764 0.49125236] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 7/2041 | B: 160/1888 | C: 264/1784
[LOSS Ex1] A: 0.68174 | B: 0.68284 | C: 0.67739
[LOGITS Ex2 A] Mean Abs: 1.034 | Max: 5.234
[LOSS Ex2] A: 0.36667 | B: 0.46613 | C: 0.42197
** [JOINT LOSS] ** : 1.098917
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005445 | Grad Max: 0.142066
  -> Layer: shared_layers.0.bias | Grad Mean: 0.267396 | Grad Max: 1.584014
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001853 | Grad Max: 0.008849
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004828 | Grad Max: 0.004828
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.118913
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042048 | Grad Max: 0.664049
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000551 | Grad Max: 0.013448
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019375 | Grad Max: 0.068838
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001287
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004560 | Grad Max: 0.011192
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000635
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001707 | Grad Max: 0.004382
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004362 | Grad Max: 0.008337
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060548 | Grad Max: 0.060548
[GRADIENT NORM TOTAL] 5.4516

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.242
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5382747 0.4617253] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 6/2042 | B: 174/1874 | C: 251/1797
[LOSS Ex1] A: 0.68107 | B: 0.68317 | C: 0.67726
[LOGITS Ex2 A] Mean Abs: 1.085 | Max: 4.626
[LOSS Ex2] A: 0.32557 | B: 0.43986 | C: 0.40084
** [JOINT LOSS] ** : 1.069260
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003773 | Grad Max: 0.095078
  -> Layer: shared_layers.0.bias | Grad Mean: 0.164695 | Grad Max: 0.964963
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001988 | Grad Max: 0.009767
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013171 | Grad Max: 0.013171
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001393 | Grad Max: 0.075752
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026155 | Grad Max: 0.399335
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.009505
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012242 | Grad Max: 0.046500
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000887
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002858 | Grad Max: 0.007058
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000405
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001056 | Grad Max: 0.002850
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002738 | Grad Max: 0.005976
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036665 | Grad Max: 0.036665
[GRADIENT NORM TOTAL] 3.3658

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.273
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5051664 0.4948336] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 3/2045 | B: 157/1891 | C: 234/1814
[LOSS Ex1] A: 0.68268 | B: 0.68120 | C: 0.67909
[LOGITS Ex2 A] Mean Abs: 1.105 | Max: 4.609
[LOSS Ex2] A: 0.33773 | B: 0.42479 | C: 0.39714
** [JOINT LOSS] ** : 1.067543
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.062917
  -> Layer: shared_layers.0.bias | Grad Mean: 0.148141 | Grad Max: 0.843071
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001852 | Grad Max: 0.008602
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010285 | Grad Max: 0.010285
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001115 | Grad Max: 0.080057
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021397 | Grad Max: 0.464312
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.007925
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010103 | Grad Max: 0.038237
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000678
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002352 | Grad Max: 0.005542
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000367
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000882 | Grad Max: 0.002380
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002015 | Grad Max: 0.004701
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030295 | Grad Max: 0.030295
[GRADIENT NORM TOTAL] 3.0180

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.107
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53793776 0.4620622 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 4/2044 | B: 160/1696 | C: 225/1823
[LOSS Ex1] A: 0.68158 | B: 0.68314 | C: 0.67868
[LOGITS Ex2 A] Mean Abs: 1.138 | Max: 4.759
[LOSS Ex2] A: 0.34235 | B: 0.43590 | C: 0.40295
** [JOINT LOSS] ** : 1.074864
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004622 | Grad Max: 0.105975
  -> Layer: shared_layers.0.bias | Grad Mean: 0.216424 | Grad Max: 1.275622
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001839 | Grad Max: 0.009111
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008047 | Grad Max: 0.008047
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001746 | Grad Max: 0.101602
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033165 | Grad Max: 0.569627
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.010862
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015194 | Grad Max: 0.054520
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001021
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003555 | Grad Max: 0.008583
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000503
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001323 | Grad Max: 0.003653
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003260 | Grad Max: 0.006586
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046741 | Grad Max: 0.046741
[GRADIENT NORM TOTAL] 4.3856

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.150
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.521187 0.478813] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 3/1613 | B: 161/1887 | C: 232/1816
[LOSS Ex1] A: 0.68080 | B: 0.68277 | C: 0.67960
[LOGITS Ex2 A] Mean Abs: 1.164 | Max: 5.040
[LOSS Ex2] A: 0.32899 | B: 0.44851 | C: 0.39416
** [JOINT LOSS] ** : 1.071608
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002371 | Grad Max: 0.048704
  -> Layer: shared_layers.0.bias | Grad Mean: 0.106709 | Grad Max: 0.642281
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001871 | Grad Max: 0.009273
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007015 | Grad Max: 0.007015
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000863 | Grad Max: 0.044347
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016138 | Grad Max: 0.250014
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000210 | Grad Max: 0.005371
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007436 | Grad Max: 0.027384
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000607
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001726 | Grad Max: 0.004476
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000281
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000645 | Grad Max: 0.001766
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001568 | Grad Max: 0.003841
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023117 | Grad Max: 0.023117
[GRADIENT NORM TOTAL] 2.1283

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.273
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090752  0.49092472] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 2/2046 | B: 175/1873 | C: 249/1799
[LOSS Ex1] A: 0.68116 | B: 0.68309 | C: 0.67758
[LOGITS Ex2 A] Mean Abs: 1.107 | Max: 5.925
[LOSS Ex2] A: 0.33903 | B: 0.43310 | C: 0.39019
** [JOINT LOSS] ** : 1.068052
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003802 | Grad Max: 0.086686
  -> Layer: shared_layers.0.bias | Grad Mean: 0.142097 | Grad Max: 0.814293
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001914 | Grad Max: 0.009382
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010236 | Grad Max: 0.010236
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001198 | Grad Max: 0.062082
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022530 | Grad Max: 0.334883
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.008182
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010266 | Grad Max: 0.037962
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000763
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002411 | Grad Max: 0.006010
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000390
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000886 | Grad Max: 0.002501
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002340 | Grad Max: 0.005571
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030911 | Grad Max: 0.030911
[GRADIENT NORM TOTAL] 2.8513

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.262
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50117713 0.49882287] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 2/2046 | B: 157/1891 | C: 246/1802
[LOSS Ex1] A: 0.68080 | B: 0.68112 | C: 0.67795
[LOGITS Ex2 A] Mean Abs: 1.089 | Max: 4.684
[LOSS Ex2] A: 0.32890 | B: 0.42163 | C: 0.41786
** [JOINT LOSS] ** : 1.069418
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003918 | Grad Max: 0.098783
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196867 | Grad Max: 1.109958
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.010111
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011786 | Grad Max: 0.011786
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001565 | Grad Max: 0.090082
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029422 | Grad Max: 0.503554
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.010091
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013748 | Grad Max: 0.050058
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000907
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003231 | Grad Max: 0.007859
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000510
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001198 | Grad Max: 0.003182
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003081 | Grad Max: 0.006310
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042254 | Grad Max: 0.042254
[GRADIENT NORM TOTAL] 3.9163

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.268
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50697035 0.49302968] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 7/2041 | B: 160/1696 | C: 162/1214
[LOSS Ex1] A: 0.67992 | B: 0.68306 | C: 0.67894
[LOGITS Ex2 A] Mean Abs: 1.099 | Max: 4.947
[LOSS Ex2] A: 0.34784 | B: 0.41105 | C: 0.40641
** [JOINT LOSS] ** : 1.069074
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.057548
  -> Layer: shared_layers.0.bias | Grad Mean: 0.076734 | Grad Max: 0.405680
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.009995
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010167 | Grad Max: 0.010167
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000644 | Grad Max: 0.037646
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010847 | Grad Max: 0.208135
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.004111
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004279 | Grad Max: 0.017468
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000377
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000973 | Grad Max: 0.002720
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000182
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000359 | Grad Max: 0.001122
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000930 | Grad Max: 0.003295
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012887 | Grad Max: 0.012887
[GRADIENT NORM TOTAL] 1.5220

[EPOCH SUMMARY] Train Loss: 1.0759

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0547 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0572 -> New: 1.0547)

############################## EPOCH 25/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.233
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5089481 0.4910519] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 4/2044 | B: 161/1887 | C: 229/1819
[LOSS Ex1] A: 0.68199 | B: 0.68268 | C: 0.68025
[LOGITS Ex2 A] Mean Abs: 1.128 | Max: 4.698
[LOSS Ex2] A: 0.35326 | B: 0.45296 | C: 0.41808
** [JOINT LOSS] ** : 1.089742
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004843 | Grad Max: 0.108671
  -> Layer: shared_layers.0.bias | Grad Mean: 0.230304 | Grad Max: 1.293460
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001813 | Grad Max: 0.008688
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010947 | Grad Max: 0.010947
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001801 | Grad Max: 0.093676
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034280 | Grad Max: 0.519659
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000449 | Grad Max: 0.011896
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015904 | Grad Max: 0.062022
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000965
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003707 | Grad Max: 0.008567
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000508
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001368 | Grad Max: 0.003656
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003271 | Grad Max: 0.006419
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047072 | Grad Max: 0.047072
[GRADIENT NORM TOTAL] 4.5141

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.164
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090251 0.4909749] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.011
[MASKS] A(Pass/Fail): 7/2041 | B: 175/1873 | C: 240/1808
[LOSS Ex1] A: 0.68158 | B: 0.68301 | C: 0.67904
[LOGITS Ex2 A] Mean Abs: 1.110 | Max: 4.626
[LOSS Ex2] A: 0.35067 | B: 0.44416 | C: 0.41265
** [JOINT LOSS] ** : 1.083704
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005661 | Grad Max: 0.142322
  -> Layer: shared_layers.0.bias | Grad Mean: 0.278661 | Grad Max: 1.562868
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.008975
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006337 | Grad Max: 0.006337
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.125159
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042120 | Grad Max: 0.698345
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000555 | Grad Max: 0.013763
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019571 | Grad Max: 0.071753
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001243
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004580 | Grad Max: 0.011031
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000637
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001695 | Grad Max: 0.004275
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004195 | Grad Max: 0.007837
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058820 | Grad Max: 0.058820
[GRADIENT NORM TOTAL] 5.5238

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.244
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5391501  0.46084985] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 6/2042 | B: 157/1891 | C: 231/1817
[LOSS Ex1] A: 0.68087 | B: 0.68103 | C: 0.67939
[LOGITS Ex2 A] Mean Abs: 1.126 | Max: 4.938
[LOSS Ex2] A: 0.33647 | B: 0.42158 | C: 0.42382
** [JOINT LOSS] ** : 1.074385
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003219 | Grad Max: 0.077269
  -> Layer: shared_layers.0.bias | Grad Mean: 0.147166 | Grad Max: 0.847878
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002030 | Grad Max: 0.010012
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015489 | Grad Max: 0.015489
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001183 | Grad Max: 0.064464
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022367 | Grad Max: 0.367199
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000295 | Grad Max: 0.006500
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010389 | Grad Max: 0.037420
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000747
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002416 | Grad Max: 0.006029
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000334
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000895 | Grad Max: 0.002371
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002098 | Grad Max: 0.004960
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030877 | Grad Max: 0.030877
[GRADIENT NORM TOTAL] 2.9663

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.275
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50507116 0.49492887] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 3/2045 | B: 161/1695 | C: 213/1835
[LOSS Ex1] A: 0.68250 | B: 0.68297 | C: 0.68031
[LOGITS Ex2 A] Mean Abs: 1.105 | Max: 4.679
[LOSS Ex2] A: 0.33248 | B: 0.42202 | C: 0.43180
** [JOINT LOSS] ** : 1.077358
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006064 | Grad Max: 0.169343
  -> Layer: shared_layers.0.bias | Grad Mean: 0.222752 | Grad Max: 1.245109
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001713 | Grad Max: 0.008382
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007349 | Grad Max: 0.007349
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.108808
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035697 | Grad Max: 0.617060
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.011800
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016006 | Grad Max: 0.061284
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001170
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003769 | Grad Max: 0.009130
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000551
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001388 | Grad Max: 0.003699
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003591 | Grad Max: 0.006955
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048568 | Grad Max: 0.048568
[GRADIENT NORM TOTAL] 4.4921

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.109
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5387489 0.4612511] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 4/2044 | B: 162/1886 | C: 257/1791
[LOSS Ex1] A: 0.68139 | B: 0.68260 | C: 0.67743
[LOGITS Ex2 A] Mean Abs: 1.090 | Max: 4.787
[LOSS Ex2] A: 0.35660 | B: 0.45194 | C: 0.43048
** [JOINT LOSS] ** : 1.093479
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007445 | Grad Max: 0.193810
  -> Layer: shared_layers.0.bias | Grad Mean: 0.307302 | Grad Max: 1.745826
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001868 | Grad Max: 0.008796
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001639 | Grad Max: 0.001639
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002630 | Grad Max: 0.205326
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049626 | Grad Max: 1.151001
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000642 | Grad Max: 0.016266
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022569 | Grad Max: 0.081059
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000116 | Grad Max: 0.001464
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005293 | Grad Max: 0.012205
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000061 | Grad Max: 0.000717
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001964 | Grad Max: 0.005143
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005063 | Grad Max: 0.009782
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.069296 | Grad Max: 0.069296
[GRADIENT NORM TOTAL] 6.3141

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.152
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52175844 0.47824153] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 3/1613 | B: 177/1871 | C: 244/1804
[LOSS Ex1] A: 0.68060 | B: 0.68293 | C: 0.67705
[LOGITS Ex2 A] Mean Abs: 1.129 | Max: 4.950
[LOSS Ex2] A: 0.33564 | B: 0.44042 | C: 0.40237
** [JOINT LOSS] ** : 1.073005
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005441 | Grad Max: 0.134084
  -> Layer: shared_layers.0.bias | Grad Mean: 0.185278 | Grad Max: 1.068156
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.009737
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009575 | Grad Max: 0.009575
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001634 | Grad Max: 0.128289
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030397 | Grad Max: 0.683243
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000391 | Grad Max: 0.010064
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013663 | Grad Max: 0.050444
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000876
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003211 | Grad Max: 0.007293
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000517
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001186 | Grad Max: 0.003270
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003212 | Grad Max: 0.006562
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042219 | Grad Max: 0.042219
[GRADIENT NORM TOTAL] 3.8085

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.276
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50900453 0.49099547] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 3/2045 | B: 161/1887 | C: 247/1801
[LOSS Ex1] A: 0.68096 | B: 0.68094 | C: 0.67762
[LOGITS Ex2 A] Mean Abs: 1.166 | Max: 4.441
[LOSS Ex2] A: 0.33323 | B: 0.42764 | C: 0.39570
** [JOINT LOSS] ** : 1.065363
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002594 | Grad Max: 0.054757
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141451 | Grad Max: 0.822219
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.008985
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005475 | Grad Max: 0.005475
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001142 | Grad Max: 0.088225
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021273 | Grad Max: 0.498815
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.006333
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009781 | Grad Max: 0.035736
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000621
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002288 | Grad Max: 0.005705
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000335
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000851 | Grad Max: 0.002294
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001991 | Grad Max: 0.004639
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029145 | Grad Max: 0.029145
[GRADIENT NORM TOTAL] 2.9138

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.264
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010153 0.4989847] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 2/2046 | B: 161/1695 | C: 238/1810
[LOSS Ex1] A: 0.68059 | B: 0.68290 | C: 0.67958
[LOGITS Ex2 A] Mean Abs: 1.163 | Max: 4.388
[LOSS Ex2] A: 0.34484 | B: 0.42515 | C: 0.40298
** [JOINT LOSS] ** : 1.072014
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004691 | Grad Max: 0.109689
  -> Layer: shared_layers.0.bias | Grad Mean: 0.233186 | Grad Max: 1.348753
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.010687
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022287 | Grad Max: 0.022287
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.105074
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036115 | Grad Max: 0.595891
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.011207
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016920 | Grad Max: 0.057218
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001038
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003937 | Grad Max: 0.009340
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000573
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001460 | Grad Max: 0.003957
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003597 | Grad Max: 0.007211
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051129 | Grad Max: 0.051129
[GRADIENT NORM TOTAL] 4.7276

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.270
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068286  0.49317142] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 8/2040 | B: 162/1886 | C: 273/1775
[LOSS Ex1] A: 0.67971 | B: 0.68252 | C: 0.67629
[LOGITS Ex2 A] Mean Abs: 1.129 | Max: 5.141
[LOSS Ex2] A: 0.34137 | B: 0.44492 | C: 0.39036
** [JOINT LOSS] ** : 1.071722
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003104 | Grad Max: 0.092938
  -> Layer: shared_layers.0.bias | Grad Mean: 0.096159 | Grad Max: 0.508472
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.010244
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009877 | Grad Max: 0.009877
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000877 | Grad Max: 0.051578
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015950 | Grad Max: 0.296624
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000203 | Grad Max: 0.005048
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006989 | Grad Max: 0.024248
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000545
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001643 | Grad Max: 0.004315
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000242
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000607 | Grad Max: 0.001686
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001414 | Grad Max: 0.003459
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020390 | Grad Max: 0.020390
[GRADIENT NORM TOTAL] 1.9990

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.235
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50885624 0.49114373] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 4/2044 | B: 179/1869 | C: 217/1831
[LOSS Ex1] A: 0.68183 | B: 0.68285 | C: 0.68013
[LOGITS Ex2 A] Mean Abs: 1.093 | Max: 4.553
[LOSS Ex2] A: 0.33116 | B: 0.43989 | C: 0.41554
** [JOINT LOSS] ** : 1.077131
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004793 | Grad Max: 0.112597
  -> Layer: shared_layers.0.bias | Grad Mean: 0.205301 | Grad Max: 1.161624
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001753 | Grad Max: 0.008372
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008306 | Grad Max: 0.008306
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001723 | Grad Max: 0.099157
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032403 | Grad Max: 0.550420
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad Max: 0.010334
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014826 | Grad Max: 0.053305
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000930
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003482 | Grad Max: 0.007949
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000483
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001284 | Grad Max: 0.003413
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003329 | Grad Max: 0.006582
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045001 | Grad Max: 0.045001
[GRADIENT NORM TOTAL] 4.1859

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.165
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093225  0.49067745] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 7/2041 | B: 164/1884 | C: 251/1797
[LOSS Ex1] A: 0.68141 | B: 0.68085 | C: 0.67695
[LOGITS Ex2 A] Mean Abs: 1.053 | Max: 5.045
[LOSS Ex2] A: 0.34342 | B: 0.42177 | C: 0.41223
** [JOINT LOSS] ** : 1.072212
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006219 | Grad Max: 0.158590
  -> Layer: shared_layers.0.bias | Grad Mean: 0.286456 | Grad Max: 1.626035
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001941 | Grad Max: 0.008696
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003383 | Grad Max: 0.003383
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002395 | Grad Max: 0.123388
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044711 | Grad Max: 0.696445
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000580 | Grad Max: 0.014720
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020470 | Grad Max: 0.076562
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001424
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004803 | Grad Max: 0.011289
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000623
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001768 | Grad Max: 0.004578
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004434 | Grad Max: 0.008635
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061091 | Grad Max: 0.061091
[GRADIENT NORM TOTAL] 5.7551

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.246
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54003197 0.45996803] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 6/2042 | B: 162/1694 | C: 238/1810
[LOSS Ex1] A: 0.68067 | B: 0.68282 | C: 0.67904
[LOGITS Ex2 A] Mean Abs: 1.108 | Max: 4.748
[LOSS Ex2] A: 0.33166 | B: 0.41671 | C: 0.41229
** [JOINT LOSS] ** : 1.067725
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003435 | Grad Max: 0.093360
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156627 | Grad Max: 0.853228
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.009988
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017613 | Grad Max: 0.017613
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001298 | Grad Max: 0.076608
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024074 | Grad Max: 0.419716
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.007940
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011088 | Grad Max: 0.041023
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000831
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002600 | Grad Max: 0.006928
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000363
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000950 | Grad Max: 0.002446
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002325 | Grad Max: 0.004844
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031778 | Grad Max: 0.031778
[GRADIENT NORM TOTAL] 3.1529

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.278
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.504959 0.495041] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 3/2045 | B: 166/1882 | C: 234/1814
[LOSS Ex1] A: 0.68234 | B: 0.68244 | C: 0.67786
[LOGITS Ex2 A] Mean Abs: 1.145 | Max: 4.664
[LOSS Ex2] A: 0.33370 | B: 0.45170 | C: 0.40808
** [JOINT LOSS] ** : 1.078710
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002645 | Grad Max: 0.059407
  -> Layer: shared_layers.0.bias | Grad Mean: 0.154051 | Grad Max: 0.878194
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.008717
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010361 | Grad Max: 0.010361
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001193 | Grad Max: 0.060841
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022955 | Grad Max: 0.327884
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.006861
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010970 | Grad Max: 0.038012
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000722
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002528 | Grad Max: 0.006021
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000323
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000924 | Grad Max: 0.002381
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002085 | Grad Max: 0.004199
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031054 | Grad Max: 0.031054
[GRADIENT NORM TOTAL] 3.0497

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.111
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5394925  0.46050745] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 4/2044 | B: 181/1867 | C: 159/1217
[LOSS Ex1] A: 0.68122 | B: 0.68277 | C: 0.67853
[LOGITS Ex2 A] Mean Abs: 1.169 | Max: 4.714
[LOSS Ex2] A: 0.35059 | B: 0.44726 | C: 0.38413
** [JOINT LOSS] ** : 1.074835
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004057 | Grad Max: 0.087677
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215651 | Grad Max: 1.206745
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.007807
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003990 | Grad Max: 0.003990
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001694 | Grad Max: 0.109191
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032105 | Grad Max: 0.622403
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.008826
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014447 | Grad Max: 0.048421
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000881
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003355 | Grad Max: 0.007556
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000527
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001233 | Grad Max: 0.003635
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002915 | Grad Max: 0.006144
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042071 | Grad Max: 0.042071
[GRADIENT NORM TOTAL] 4.3702

[EPOCH SUMMARY] Train Loss: 1.0765

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0466 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0547 -> New: 1.0466)

############################## EPOCH 26/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.154
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5222757  0.47772428] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 3/1613 | B: 168/1880 | C: 232/1816
[LOSS Ex1] A: 0.68043 | B: 0.68077 | C: 0.67942
[LOGITS Ex2 A] Mean Abs: 1.179 | Max: 4.736
[LOSS Ex2] A: 0.31987 | B: 0.42206 | C: 0.41540
** [JOINT LOSS] ** : 1.065983
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002794 | Grad Max: 0.069399
  -> Layer: shared_layers.0.bias | Grad Mean: 0.147425 | Grad Max: 0.823084
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001969 | Grad Max: 0.009595
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010339 | Grad Max: 0.010339
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001128 | Grad Max: 0.063463
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021574 | Grad Max: 0.361216
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.006104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009689 | Grad Max: 0.032068
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000611
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002250 | Grad Max: 0.005465
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000351
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000827 | Grad Max: 0.002445
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001874 | Grad Max: 0.004266
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028363 | Grad Max: 0.028363
[GRADIENT NORM TOTAL] 2.9036

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.279
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5089412  0.49105883] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 3/2045 | B: 163/1693 | C: 223/1825
[LOSS Ex1] A: 0.68078 | B: 0.68274 | C: 0.67913
[LOGITS Ex2 A] Mean Abs: 1.127 | Max: 4.959
[LOSS Ex2] A: 0.33406 | B: 0.42404 | C: 0.40502
** [JOINT LOSS] ** : 1.068590
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003014 | Grad Max: 0.074776
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141641 | Grad Max: 0.781797
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.009421
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012508 | Grad Max: 0.012508
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001125 | Grad Max: 0.070009
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020960 | Grad Max: 0.395921
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000279 | Grad Max: 0.007395
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009915 | Grad Max: 0.039264
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000809
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002316 | Grad Max: 0.006279
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000344
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000844 | Grad Max: 0.002218
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001999 | Grad Max: 0.004568
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028150 | Grad Max: 0.028150
[GRADIENT NORM TOTAL] 2.8408

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.266
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008837  0.49911627] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 5/2043 | B: 168/1880 | C: 258/1790
[LOSS Ex1] A: 0.68039 | B: 0.68236 | C: 0.67770
[LOGITS Ex2 A] Mean Abs: 1.105 | Max: 4.665
[LOSS Ex2] A: 0.33782 | B: 0.45862 | C: 0.38589
** [JOINT LOSS] ** : 1.074261
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002964 | Grad Max: 0.099836
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207623 | Grad Max: 1.178064
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002046 | Grad Max: 0.010047
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015443 | Grad Max: 0.015443
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001602 | Grad Max: 0.104481
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030230 | Grad Max: 0.598527
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000398 | Grad Max: 0.009145
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014243 | Grad Max: 0.051371
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000951
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003319 | Grad Max: 0.008358
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000451
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001221 | Grad Max: 0.003261
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002996 | Grad Max: 0.005787
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041970 | Grad Max: 0.041970
[GRADIENT NORM TOTAL] 4.2206

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.273
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067142  0.49328578] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 9/2039 | B: 181/1867 | C: 293/1755
[LOSS Ex1] A: 0.67951 | B: 0.68269 | C: 0.67415
[LOGITS Ex2 A] Mean Abs: 1.119 | Max: 5.659
[LOSS Ex2] A: 0.33860 | B: 0.42963 | C: 0.39251
** [JOINT LOSS] ** : 1.065698
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.050281
  -> Layer: shared_layers.0.bias | Grad Mean: 0.079348 | Grad Max: 0.454141
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.009495
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004054 | Grad Max: 0.004054
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000695 | Grad Max: 0.050350
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011855 | Grad Max: 0.280168
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.004466
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004770 | Grad Max: 0.021603
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000424
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001087 | Grad Max: 0.002955
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000194
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000401 | Grad Max: 0.001210
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001092 | Grad Max: 0.002921
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014985 | Grad Max: 0.014985
[GRADIENT NORM TOTAL] 1.6347

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.237
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50877744 0.49122256] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 4/2044 | B: 168/1880 | C: 231/1817
[LOSS Ex1] A: 0.68167 | B: 0.68068 | C: 0.67787
[LOGITS Ex2 A] Mean Abs: 1.151 | Max: 5.028
[LOSS Ex2] A: 0.33775 | B: 0.42573 | C: 0.40620
** [JOINT LOSS] ** : 1.069965
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003834 | Grad Max: 0.093984
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196621 | Grad Max: 1.129699
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.007691
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001742 | Grad Max: 0.001742
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001578 | Grad Max: 0.090020
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029790 | Grad Max: 0.511899
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.008889
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013790 | Grad Max: 0.048233
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000926
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003205 | Grad Max: 0.007400
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000440
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001174 | Grad Max: 0.003003
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002706 | Grad Max: 0.005721
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039634 | Grad Max: 0.039634
[GRADIENT NORM TOTAL] 3.9795

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.167
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50959677 0.49040323] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.513 | Std: 0.012
[MASKS] A(Pass/Fail): 8/2040 | B: 163/1693 | C: 230/1818
[LOSS Ex1] A: 0.68125 | B: 0.68266 | C: 0.67860
[LOGITS Ex2 A] Mean Abs: 1.114 | Max: 4.910
[LOSS Ex2] A: 0.34906 | B: 0.42442 | C: 0.41097
** [JOINT LOSS] ** : 1.075652
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005779 | Grad Max: 0.124677
  -> Layer: shared_layers.0.bias | Grad Mean: 0.240921 | Grad Max: 1.360271
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.008999
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007298 | Grad Max: 0.007298
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.114190
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037368 | Grad Max: 0.627017
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000484 | Grad Max: 0.011616
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017072 | Grad Max: 0.058067
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001168
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003975 | Grad Max: 0.010223
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000538
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001465 | Grad Max: 0.003842
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003647 | Grad Max: 0.007000
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051487 | Grad Max: 0.051487
[GRADIENT NORM TOTAL] 4.8180

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.249
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54086053 0.45913947] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 6/2042 | B: 169/1879 | C: 241/1807
[LOSS Ex1] A: 0.68047 | B: 0.68228 | C: 0.67830
[LOGITS Ex2 A] Mean Abs: 1.133 | Max: 4.978
[LOSS Ex2] A: 0.32888 | B: 0.44624 | C: 0.40803
** [JOINT LOSS] ** : 1.074732
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003039 | Grad Max: 0.082605
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111221 | Grad Max: 0.578430
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.009809
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016003 | Grad Max: 0.016003
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000907 | Grad Max: 0.055410
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016762 | Grad Max: 0.298375
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000220 | Grad Max: 0.006533
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007764 | Grad Max: 0.030210
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000575
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001803 | Grad Max: 0.004998
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000283
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000659 | Grad Max: 0.001855
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001501 | Grad Max: 0.003785
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022341 | Grad Max: 0.022341
[GRADIENT NORM TOTAL] 2.2171

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.281
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5048996  0.49510044] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 3/2045 | B: 181/1867 | C: 241/1807
[LOSS Ex1] A: 0.68217 | B: 0.68261 | C: 0.67725
[LOGITS Ex2 A] Mean Abs: 1.109 | Max: 4.642
[LOSS Ex2] A: 0.33004 | B: 0.42629 | C: 0.41351
** [JOINT LOSS] ** : 1.070621
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006898 | Grad Max: 0.184518
  -> Layer: shared_layers.0.bias | Grad Mean: 0.232190 | Grad Max: 1.348892
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001797 | Grad Max: 0.008291
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004338 | Grad Max: 0.004338
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.141033
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037615 | Grad Max: 0.736018
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000474 | Grad Max: 0.011104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016737 | Grad Max: 0.061166
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001194
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003942 | Grad Max: 0.010395
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000577
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001441 | Grad Max: 0.003773
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003555 | Grad Max: 0.006836
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049491 | Grad Max: 0.049491
[GRADIENT NORM TOTAL] 4.7316

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.113
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54026365 0.45973638] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 4/2044 | B: 168/1880 | C: 228/1820
[LOSS Ex1] A: 0.68102 | B: 0.68059 | C: 0.67952
[LOGITS Ex2 A] Mean Abs: 1.115 | Max: 5.139
[LOSS Ex2] A: 0.36052 | B: 0.43536 | C: 0.42603
** [JOINT LOSS] ** : 1.087679
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007674 | Grad Max: 0.209075
  -> Layer: shared_layers.0.bias | Grad Mean: 0.316458 | Grad Max: 1.701007
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001883 | Grad Max: 0.009175
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007063 | Grad Max: 0.007063
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002564 | Grad Max: 0.139681
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048443 | Grad Max: 0.733838
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000630 | Grad Max: 0.014286
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022265 | Grad Max: 0.075175
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001430
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005204 | Grad Max: 0.011833
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000059 | Grad Max: 0.000724
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001917 | Grad Max: 0.004871
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004753 | Grad Max: 0.008933
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.066770 | Grad Max: 0.066770
[GRADIENT NORM TOTAL] 6.1865

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.156
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52283317 0.4771669 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 3/1613 | B: 164/1692 | C: 240/1808
[LOSS Ex1] A: 0.68023 | B: 0.68257 | C: 0.67863
[LOGITS Ex2 A] Mean Abs: 1.154 | Max: 4.757
[LOSS Ex2] A: 0.32360 | B: 0.41532 | C: 0.41223
** [JOINT LOSS] ** : 1.064195
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005550 | Grad Max: 0.154371
  -> Layer: shared_layers.0.bias | Grad Mean: 0.223447 | Grad Max: 1.171280
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.009523
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008248 | Grad Max: 0.008248
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001841 | Grad Max: 0.100727
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034125 | Grad Max: 0.538353
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000434 | Grad Max: 0.010027
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015311 | Grad Max: 0.051062
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000962
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003604 | Grad Max: 0.008134
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000499
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001319 | Grad Max: 0.003677
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003374 | Grad Max: 0.006922
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045826 | Grad Max: 0.045826
[GRADIENT NORM TOTAL] 4.4147

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.282
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50891906 0.49108094] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 3/2045 | B: 170/1878 | C: 212/1836
[LOSS Ex1] A: 0.68058 | B: 0.68219 | C: 0.68051
[LOGITS Ex2 A] Mean Abs: 1.172 | Max: 5.579
[LOSS Ex2] A: 0.34019 | B: 0.44472 | C: 0.42590
** [JOINT LOSS] ** : 1.084698
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.044449
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116249 | Grad Max: 0.713034
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001923 | Grad Max: 0.009627
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015035 | Grad Max: 0.015035
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000911 | Grad Max: 0.084594
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017137 | Grad Max: 0.465588
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.005276
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007598 | Grad Max: 0.026091
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000523
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001718 | Grad Max: 0.004148
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000622 | Grad Max: 0.001666
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001336 | Grad Max: 0.002969
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020879 | Grad Max: 0.020879
[GRADIENT NORM TOTAL] 2.4080

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.269
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007744  0.49922562] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 5/2043 | B: 182/1866 | C: 247/1801
[LOSS Ex1] A: 0.68019 | B: 0.68252 | C: 0.67869
[LOGITS Ex2 A] Mean Abs: 1.171 | Max: 4.494
[LOSS Ex2] A: 0.34103 | B: 0.44336 | C: 0.39295
** [JOINT LOSS] ** : 1.072915
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.113411
  -> Layer: shared_layers.0.bias | Grad Mean: 0.217494 | Grad Max: 1.301247
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.010451
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019571 | Grad Max: 0.019571
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001764 | Grad Max: 0.107975
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033429 | Grad Max: 0.611692
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000423 | Grad Max: 0.011086
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015087 | Grad Max: 0.056345
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000898
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003502 | Grad Max: 0.007855
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000462
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001287 | Grad Max: 0.003497
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002987 | Grad Max: 0.006420
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043772 | Grad Max: 0.043772
[GRADIENT NORM TOTAL] 4.4461

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.275
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50662386 0.4933761 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 10/2038 | B: 173/1875 | C: 252/1796
[LOSS Ex1] A: 0.67930 | B: 0.68050 | C: 0.67670
[LOGITS Ex2 A] Mean Abs: 1.155 | Max: 5.816
[LOSS Ex2] A: 0.34571 | B: 0.41956 | C: 0.39320
** [JOINT LOSS] ** : 1.064989
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003437 | Grad Max: 0.077904
  -> Layer: shared_layers.0.bias | Grad Mean: 0.093541 | Grad Max: 0.514497
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.009913
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007770 | Grad Max: 0.007770
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000855 | Grad Max: 0.053902
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015670 | Grad Max: 0.305307
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000193 | Grad Max: 0.004575
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006776 | Grad Max: 0.020363
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000524
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001592 | Grad Max: 0.004377
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000253
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000585 | Grad Max: 0.001682
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001333 | Grad Max: 0.003443
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019792 | Grad Max: 0.019792
[GRADIENT NORM TOTAL] 1.9453

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.238
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50872445 0.49127558] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 4/2044 | B: 164/1692 | C: 157/1219
[LOSS Ex1] A: 0.68151 | B: 0.68249 | C: 0.67950
[LOGITS Ex2 A] Mean Abs: 1.095 | Max: 5.129
[LOSS Ex2] A: 0.33554 | B: 0.41415 | C: 0.40115
** [JOINT LOSS] ** : 1.064775
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004687 | Grad Max: 0.121150
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207749 | Grad Max: 1.165584
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001766 | Grad Max: 0.008440
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006534 | Grad Max: 0.006534
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001700 | Grad Max: 0.090499
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032173 | Grad Max: 0.509344
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000412 | Grad Max: 0.010148
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014671 | Grad Max: 0.053669
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000870
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003416 | Grad Max: 0.007992
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000464
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001240 | Grad Max: 0.003267
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002983 | Grad Max: 0.006202
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042074 | Grad Max: 0.042074
[GRADIENT NORM TOTAL] 4.2033

[EPOCH SUMMARY] Train Loss: 1.0718

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0508 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 27/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.168
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50987583 0.4901242 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.012
[MASKS] A(Pass/Fail): 10/2038 | B: 172/1876 | C: 254/1794
[LOSS Ex1] A: 0.68109 | B: 0.68211 | C: 0.67703
[LOGITS Ex2 A] Mean Abs: 1.061 | Max: 4.979
[LOSS Ex2] A: 0.33835 | B: 0.45066 | C: 0.41134
** [JOINT LOSS] ** : 1.080189
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005637 | Grad Max: 0.141784
  -> Layer: shared_layers.0.bias | Grad Mean: 0.277865 | Grad Max: 1.576257
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001881 | Grad Max: 0.009068
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006187 | Grad Max: 0.006187
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.129354
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042604 | Grad Max: 0.717589
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000547 | Grad Max: 0.013792
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019487 | Grad Max: 0.066817
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001230
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004536 | Grad Max: 0.011031
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000575
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001665 | Grad Max: 0.004123
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004113 | Grad Max: 0.007804
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057496 | Grad Max: 0.057496
[GRADIENT NORM TOTAL] 5.6219

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.251
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5417202  0.45827976] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 6/2042 | B: 182/1866 | C: 245/1803
[LOSS Ex1] A: 0.68027 | B: 0.68244 | C: 0.67842
[LOGITS Ex2 A] Mean Abs: 1.119 | Max: 4.913
[LOSS Ex2] A: 0.31206 | B: 0.43696 | C: 0.40551
** [JOINT LOSS] ** : 1.065219
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002771 | Grad Max: 0.067635
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107865 | Grad Max: 0.640462
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.009344
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013410 | Grad Max: 0.013410
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000951 | Grad Max: 0.069632
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018121 | Grad Max: 0.385997
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.005340
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008289 | Grad Max: 0.029245
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000565
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001929 | Grad Max: 0.004814
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000299
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000707 | Grad Max: 0.002007
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001774 | Grad Max: 0.004197
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024497 | Grad Max: 0.024497
[GRADIENT NORM TOTAL] 2.2718

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.283
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047949 0.4952051] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 4/2044 | B: 173/1875 | C: 242/1806
[LOSS Ex1] A: 0.68201 | B: 0.68041 | C: 0.67855
[LOGITS Ex2 A] Mean Abs: 1.157 | Max: 4.777
[LOSS Ex2] A: 0.33942 | B: 0.42848 | C: 0.40490
** [JOINT LOSS] ** : 1.071254
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004375 | Grad Max: 0.103616
  -> Layer: shared_layers.0.bias | Grad Mean: 0.200073 | Grad Max: 1.124629
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001884 | Grad Max: 0.008871
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010080 | Grad Max: 0.010080
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001624 | Grad Max: 0.092447
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030964 | Grad Max: 0.520041
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000409 | Grad Max: 0.009812
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014680 | Grad Max: 0.052220
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000884
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003390 | Grad Max: 0.007465
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000443
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001236 | Grad Max: 0.003278
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002918 | Grad Max: 0.005823
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042163 | Grad Max: 0.042163
[GRADIENT NORM TOTAL] 4.0637

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.114
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409866 0.4590134] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 6/2042 | B: 164/1692 | C: 256/1792
[LOSS Ex1] A: 0.68086 | B: 0.68240 | C: 0.67662
[LOGITS Ex2 A] Mean Abs: 1.198 | Max: 5.065
[LOSS Ex2] A: 0.34927 | B: 0.42088 | C: 0.39175
** [JOINT LOSS] ** : 1.067259
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005709 | Grad Max: 0.129943
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264654 | Grad Max: 1.455758
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001888 | Grad Max: 0.008819
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003805 | Grad Max: 0.003805
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.130038
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039902 | Grad Max: 0.725790
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000512 | Grad Max: 0.012422
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018313 | Grad Max: 0.063152
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001195
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004261 | Grad Max: 0.010427
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000575
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001559 | Grad Max: 0.004131
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003746 | Grad Max: 0.006913
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053628 | Grad Max: 0.053628
[GRADIENT NORM TOTAL] 5.2881

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.158
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5233412  0.47665882] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 4/1612 | B: 173/1875 | C: 229/1819
[LOSS Ex1] A: 0.68006 | B: 0.68202 | C: 0.67906
[LOGITS Ex2 A] Mean Abs: 1.193 | Max: 4.902
[LOSS Ex2] A: 0.32425 | B: 0.44516 | C: 0.39063
** [JOINT LOSS] ** : 1.067062
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004015 | Grad Max: 0.096731
  -> Layer: shared_layers.0.bias | Grad Mean: 0.170559 | Grad Max: 0.961364
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.010011
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013301 | Grad Max: 0.013301
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001394 | Grad Max: 0.073601
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026693 | Grad Max: 0.409170
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000341 | Grad Max: 0.010517
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012285 | Grad Max: 0.047445
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000751
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002843 | Grad Max: 0.006685
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000380
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001044 | Grad Max: 0.002733
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002388 | Grad Max: 0.005043
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035951 | Grad Max: 0.035951
[GRADIENT NORM TOTAL] 3.4539

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.284
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5088399  0.49116012] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 3/2045 | B: 184/1864 | C: 251/1797
[LOSS Ex1] A: 0.68040 | B: 0.68236 | C: 0.67716
[LOGITS Ex2 A] Mean Abs: 1.139 | Max: 5.975
[LOSS Ex2] A: 0.33513 | B: 0.43080 | C: 0.39491
** [JOINT LOSS] ** : 1.066920
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002912 | Grad Max: 0.074218
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134808 | Grad Max: 0.775246
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.008946
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007361 | Grad Max: 0.007361
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001092 | Grad Max: 0.076937
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020505 | Grad Max: 0.432948
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000262 | Grad Max: 0.007671
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009363 | Grad Max: 0.036072
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000650
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002180 | Grad Max: 0.005046
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000288
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000793 | Grad Max: 0.002162
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001964 | Grad Max: 0.004524
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027330 | Grad Max: 0.027330
[GRADIENT NORM TOTAL] 2.7620

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.271
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50060904 0.4993909 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 6/2042 | B: 175/1873 | C: 244/1804
[LOSS Ex1] A: 0.67999 | B: 0.68032 | C: 0.67805
[LOGITS Ex2 A] Mean Abs: 1.130 | Max: 4.663
[LOSS Ex2] A: 0.34104 | B: 0.42685 | C: 0.42305
** [JOINT LOSS] ** : 1.076432
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004062 | Grad Max: 0.099994
  -> Layer: shared_layers.0.bias | Grad Mean: 0.217314 | Grad Max: 1.220994
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002021 | Grad Max: 0.010039
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010607 | Grad Max: 0.010607
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001713 | Grad Max: 0.106623
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032218 | Grad Max: 0.604028
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.009433
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014887 | Grad Max: 0.050978
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.001004
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003457 | Grad Max: 0.009137
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000407
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001265 | Grad Max: 0.003202
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003064 | Grad Max: 0.005562
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043052 | Grad Max: 0.043052
[GRADIENT NORM TOTAL] 4.3812

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.277
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50649977 0.49350023] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 11/2037 | B: 164/1692 | C: 222/1826
[LOSS Ex1] A: 0.67910 | B: 0.68232 | C: 0.67854
[LOGITS Ex2 A] Mean Abs: 1.136 | Max: 5.058
[LOSS Ex2] A: 0.32897 | B: 0.41481 | C: 0.43137
** [JOINT LOSS] ** : 1.071707
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001924 | Grad Max: 0.048165
  -> Layer: shared_layers.0.bias | Grad Mean: 0.090937 | Grad Max: 0.533452
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.009730
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008116 | Grad Max: 0.008116
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000771 | Grad Max: 0.046267
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013571 | Grad Max: 0.258440
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.005542
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006166 | Grad Max: 0.025590
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000511
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001408 | Grad Max: 0.003717
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000221
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000513 | Grad Max: 0.001379
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001286 | Grad Max: 0.003369
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018212 | Grad Max: 0.018212
[GRADIENT NORM TOTAL] 1.9061

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.240
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.508641   0.49135903] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 4/2044 | B: 173/1875 | C: 211/1837
[LOSS Ex1] A: 0.68135 | B: 0.68194 | C: 0.67906
[LOGITS Ex2 A] Mean Abs: 1.154 | Max: 5.050
[LOSS Ex2] A: 0.33550 | B: 0.46291 | C: 0.38133
** [JOINT LOSS] ** : 1.074035
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004003 | Grad Max: 0.094192
  -> Layer: shared_layers.0.bias | Grad Mean: 0.204169 | Grad Max: 1.140155
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001761 | Grad Max: 0.008040
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005614 | Grad Max: 0.005614
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001590 | Grad Max: 0.096511
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030237 | Grad Max: 0.513325
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.010310
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013834 | Grad Max: 0.054223
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000813
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003195 | Grad Max: 0.007197
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000434
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001166 | Grad Max: 0.002976
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002750 | Grad Max: 0.006385
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040376 | Grad Max: 0.040376
[GRADIENT NORM TOTAL] 4.0545

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.169
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101792  0.48982075] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 11/2037 | B: 185/1863 | C: 239/1809
[LOSS Ex1] A: 0.68093 | B: 0.68228 | C: 0.67897
[LOGITS Ex2 A] Mean Abs: 1.138 | Max: 4.614
[LOSS Ex2] A: 0.33844 | B: 0.44249 | C: 0.38726
** [JOINT LOSS] ** : 1.070127
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004404 | Grad Max: 0.108500
  -> Layer: shared_layers.0.bias | Grad Mean: 0.251120 | Grad Max: 1.430617
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.009010
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008466 | Grad Max: 0.008466
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.116662
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037311 | Grad Max: 0.651976
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.011306
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017445 | Grad Max: 0.062896
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001115
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004037 | Grad Max: 0.008983
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000541
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001484 | Grad Max: 0.004060
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003441 | Grad Max: 0.007188
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050738 | Grad Max: 0.050738
[GRADIENT NORM TOTAL] 5.0619

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.253
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54257405 0.45742592] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.013
[MASKS] A(Pass/Fail): 6/2042 | B: 175/1873 | C: 231/1817
[LOSS Ex1] A: 0.68007 | B: 0.68023 | C: 0.67846
[LOGITS Ex2 A] Mean Abs: 1.142 | Max: 4.906
[LOSS Ex2] A: 0.32518 | B: 0.42297 | C: 0.40858
** [JOINT LOSS] ** : 1.065165
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.051005
  -> Layer: shared_layers.0.bias | Grad Mean: 0.131753 | Grad Max: 0.783788
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001982 | Grad Max: 0.009502
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011641 | Grad Max: 0.011641
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001030 | Grad Max: 0.071251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019390 | Grad Max: 0.405110
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000253 | Grad Max: 0.006554
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009142 | Grad Max: 0.034882
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000589
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002113 | Grad Max: 0.005066
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000310
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000776 | Grad Max: 0.002129
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001757 | Grad Max: 0.004363
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026461 | Grad Max: 0.026461
[GRADIENT NORM TOTAL] 2.7437

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.286
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50471705 0.49528295] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 5/2043 | B: 165/1691 | C: 275/1773
[LOSS Ex1] A: 0.68184 | B: 0.68224 | C: 0.67511
[LOGITS Ex2 A] Mean Abs: 1.123 | Max: 5.052
[LOSS Ex2] A: 0.33360 | B: 0.41755 | C: 0.39027
** [JOINT LOSS] ** : 1.060204
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005997 | Grad Max: 0.167012
  -> Layer: shared_layers.0.bias | Grad Mean: 0.179769 | Grad Max: 0.933191
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001854 | Grad Max: 0.008602
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007307 | Grad Max: 0.007307
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001574 | Grad Max: 0.095344
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028494 | Grad Max: 0.482856
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.007773
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012490 | Grad Max: 0.041355
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000794
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002962 | Grad Max: 0.006787
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000428
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001070 | Grad Max: 0.002785
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002689 | Grad Max: 0.005348
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036136 | Grad Max: 0.036136
[GRADIENT NORM TOTAL] 3.5955

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.116
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54175127 0.45824873] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 6/2042 | B: 173/1875 | C: 232/1816
[LOSS Ex1] A: 0.68068 | B: 0.68186 | C: 0.67845
[LOGITS Ex2 A] Mean Abs: 1.117 | Max: 5.120
[LOSS Ex2] A: 0.33507 | B: 0.44387 | C: 0.41628
** [JOINT LOSS] ** : 1.078735
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005736 | Grad Max: 0.144012
  -> Layer: shared_layers.0.bias | Grad Mean: 0.246662 | Grad Max: 1.330100
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001902 | Grad Max: 0.009013
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009992 | Grad Max: 0.009992
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.117875
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037865 | Grad Max: 0.654216
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.010034
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017295 | Grad Max: 0.056839
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001152
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004034 | Grad Max: 0.009263
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000587
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001472 | Grad Max: 0.003844
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003611 | Grad Max: 0.006741
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050324 | Grad Max: 0.050324
[GRADIENT NORM TOTAL] 4.9377

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.161
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5238829  0.47611713] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.013
[MASKS] A(Pass/Fail): 6/1610 | B: 185/1863 | C: 174/1202
[LOSS Ex1] A: 0.67987 | B: 0.68219 | C: 0.67796
[LOGITS Ex2 A] Mean Abs: 1.153 | Max: 4.635
[LOSS Ex2] A: 0.31776 | B: 0.43434 | C: 0.38981
** [JOINT LOSS] ** : 1.060645
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002971 | Grad Max: 0.079283
  -> Layer: shared_layers.0.bias | Grad Mean: 0.106956 | Grad Max: 0.581226
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001949 | Grad Max: 0.009745
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011892 | Grad Max: 0.011892
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000930 | Grad Max: 0.059576
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016884 | Grad Max: 0.327373
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000217 | Grad Max: 0.005668
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007647 | Grad Max: 0.026544
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000551
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001812 | Grad Max: 0.004557
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000317
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000665 | Grad Max: 0.001948
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001780 | Grad Max: 0.003952
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024057 | Grad Max: 0.024057
[GRADIENT NORM TOTAL] 2.2030

[EPOCH SUMMARY] Train Loss: 1.0696

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0436 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0466 -> New: 1.0436)

############################## EPOCH 28/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.287
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5087953  0.49120468] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 3/2045 | B: 177/1871 | C: 237/1811
[LOSS Ex1] A: 0.68022 | B: 0.68014 | C: 0.67862
[LOGITS Ex2 A] Mean Abs: 1.189 | Max: 5.127
[LOSS Ex2] A: 0.32237 | B: 0.41906 | C: 0.40774
** [JOINT LOSS] ** : 1.062714
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003888 | Grad Max: 0.096682
  -> Layer: shared_layers.0.bias | Grad Mean: 0.213532 | Grad Max: 1.253441
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001945 | Grad Max: 0.009294
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010937 | Grad Max: 0.010937
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001708 | Grad Max: 0.099755
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032354 | Grad Max: 0.567617
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.010112
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014647 | Grad Max: 0.052557
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.001022
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003394 | Grad Max: 0.008775
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000443
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001234 | Grad Max: 0.003349
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002893 | Grad Max: 0.005803
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042389 | Grad Max: 0.042389
[GRADIENT NORM TOTAL] 4.3385

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.273
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50046057 0.4995394 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 6/2042 | B: 165/1691 | C: 236/1812
[LOSS Ex1] A: 0.67979 | B: 0.68215 | C: 0.67871
[LOGITS Ex2 A] Mean Abs: 1.188 | Max: 4.765
[LOSS Ex2] A: 0.33251 | B: 0.42316 | C: 0.40627
** [JOINT LOSS] ** : 1.067534
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006666 | Grad Max: 0.149986
  -> Layer: shared_layers.0.bias | Grad Mean: 0.292075 | Grad Max: 1.710276
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.009873
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014084 | Grad Max: 0.014084
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002427 | Grad Max: 0.132277
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045689 | Grad Max: 0.720080
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.012746
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020476 | Grad Max: 0.068145
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001371
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004747 | Grad Max: 0.011397
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000610
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001730 | Grad Max: 0.004600
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004160 | Grad Max: 0.008174
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059744 | Grad Max: 0.059744
[GRADIENT NORM TOTAL] 5.8828

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.279
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063853 0.4936147] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 11/2037 | B: 173/1875 | C: 234/1814
[LOSS Ex1] A: 0.67890 | B: 0.68177 | C: 0.67825
[LOGITS Ex2 A] Mean Abs: 1.170 | Max: 5.309
[LOSS Ex2] A: 0.34393 | B: 0.44673 | C: 0.37232
** [JOINT LOSS] ** : 1.067298
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005107 | Grad Max: 0.151228
  -> Layer: shared_layers.0.bias | Grad Mean: 0.163154 | Grad Max: 0.863652
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.009780
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009629 | Grad Max: 0.009629
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001384 | Grad Max: 0.082603
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025477 | Grad Max: 0.414629
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.006467
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011029 | Grad Max: 0.033880
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000777
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002597 | Grad Max: 0.006553
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000389
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000953 | Grad Max: 0.002707
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002232 | Grad Max: 0.004785
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032653 | Grad Max: 0.032653
[GRADIENT NORM TOTAL] 3.2410

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.242
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085627  0.49143732] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 5/2043 | B: 188/1860 | C: 240/1808
[LOSS Ex1] A: 0.68120 | B: 0.68211 | C: 0.67800
[LOGITS Ex2 A] Mean Abs: 1.113 | Max: 4.852
[LOSS Ex2] A: 0.31846 | B: 0.43517 | C: 0.41645
** [JOINT LOSS] ** : 1.070464
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.061402
  -> Layer: shared_layers.0.bias | Grad Mean: 0.114148 | Grad Max: 0.637683
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001802 | Grad Max: 0.008439
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007440 | Grad Max: 0.007440
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000908 | Grad Max: 0.052279
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016940 | Grad Max: 0.289205
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000219 | Grad Max: 0.006373
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007931 | Grad Max: 0.032347
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000546
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001835 | Grad Max: 0.004332
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000278
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000668 | Grad Max: 0.001908
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001779 | Grad Max: 0.004188
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024083 | Grad Max: 0.024083
[GRADIENT NORM TOTAL] 2.3024

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.170
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51047885 0.48952112] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 12/2036 | B: 177/1871 | C: 229/1819
[LOSS Ex1] A: 0.68077 | B: 0.68004 | C: 0.67853
[LOGITS Ex2 A] Mean Abs: 1.077 | Max: 5.308
[LOSS Ex2] A: 0.33117 | B: 0.41601 | C: 0.40555
** [JOINT LOSS] ** : 1.064028
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003385 | Grad Max: 0.087062
  -> Layer: shared_layers.0.bias | Grad Mean: 0.190775 | Grad Max: 1.102121
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001866 | Grad Max: 0.008665
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004924 | Grad Max: 0.004924
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001492 | Grad Max: 0.088266
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028480 | Grad Max: 0.492203
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.009287
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013540 | Grad Max: 0.050835
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000803
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003132 | Grad Max: 0.007637
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000409
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001132 | Grad Max: 0.002948
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002749 | Grad Max: 0.006311
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038228 | Grad Max: 0.038228
[GRADIENT NORM TOTAL] 3.8431

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.256
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54343307 0.45656696] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.013
[MASKS] A(Pass/Fail): 8/2040 | B: 166/1690 | C: 254/1794
[LOSS Ex1] A: 0.67988 | B: 0.68207 | C: 0.67722
[LOGITS Ex2 A] Mean Abs: 1.143 | Max: 5.125
[LOSS Ex2] A: 0.32601 | B: 0.41456 | C: 0.38377
** [JOINT LOSS] ** : 1.054505
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.040447
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084002 | Grad Max: 0.430196
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.010317
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015449 | Grad Max: 0.015449
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000680 | Grad Max: 0.053514
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011911 | Grad Max: 0.299942
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000134 | Grad Max: 0.004800
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004737 | Grad Max: 0.021567
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000415
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001077 | Grad Max: 0.003165
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000177
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000381 | Grad Max: 0.001225
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000861 | Grad Max: 0.002645
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012092 | Grad Max: 0.012092
[GRADIENT NORM TOTAL] 1.7562

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.288
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50460064 0.49539936] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 8/2040 | B: 173/1875 | C: 237/1811
[LOSS Ex1] A: 0.68169 | B: 0.68168 | C: 0.67814
[LOGITS Ex2 A] Mean Abs: 1.177 | Max: 4.518
[LOSS Ex2] A: 0.33050 | B: 0.45056 | C: 0.39552
** [JOINT LOSS] ** : 1.072697
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003829 | Grad Max: 0.090729
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203930 | Grad Max: 1.134232
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.009250
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016457 | Grad Max: 0.016457
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001592 | Grad Max: 0.098199
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030599 | Grad Max: 0.552833
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000405 | Grad Max: 0.009619
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014633 | Grad Max: 0.052773
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000974
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003367 | Grad Max: 0.008326
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000481
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001216 | Grad Max: 0.003452
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002816 | Grad Max: 0.005718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041341 | Grad Max: 0.041341
[GRADIENT NORM TOTAL] 4.0880

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.118
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54250455 0.45749545] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 11/2037 | B: 189/1859 | C: 236/1812
[LOSS Ex1] A: 0.68051 | B: 0.68202 | C: 0.67838
[LOGITS Ex2 A] Mean Abs: 1.179 | Max: 5.008
[LOSS Ex2] A: 0.32669 | B: 0.43986 | C: 0.37894
** [JOINT LOSS] ** : 1.062133
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003490 | Grad Max: 0.087588
  -> Layer: shared_layers.0.bias | Grad Mean: 0.223068 | Grad Max: 1.221784
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001792 | Grad Max: 0.008338
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004214 | Grad Max: 0.004214
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001711 | Grad Max: 0.110276
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032296 | Grad Max: 0.624411
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000419 | Grad Max: 0.010485
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015137 | Grad Max: 0.054231
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000989
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003502 | Grad Max: 0.008401
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000442
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001271 | Grad Max: 0.003243
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002871 | Grad Max: 0.006430
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043130 | Grad Max: 0.043130
[GRADIENT NORM TOTAL] 4.5648

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.163
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.524433   0.47556695] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.013
[MASKS] A(Pass/Fail): 7/1609 | B: 178/1870 | C: 244/1804
[LOSS Ex1] A: 0.67970 | B: 0.67995 | C: 0.67810
[LOGITS Ex2 A] Mean Abs: 1.224 | Max: 4.736
[LOSS Ex2] A: 0.30810 | B: 0.41725 | C: 0.36704
** [JOINT LOSS] ** : 1.043379
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.051746
  -> Layer: shared_layers.0.bias | Grad Mean: 0.105315 | Grad Max: 0.583130
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.009723
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009147 | Grad Max: 0.009147
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000827 | Grad Max: 0.048930
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015457 | Grad Max: 0.280295
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.006248
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007196 | Grad Max: 0.031528
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000554
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001656 | Grad Max: 0.004205
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000306
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000607 | Grad Max: 0.001931
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001154 | Grad Max: 0.003786
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019928 | Grad Max: 0.019928
[GRADIENT NORM TOTAL] 2.1507

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.289
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50870925 0.4912908 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 5/2043 | B: 169/1687 | C: 263/1785
[LOSS Ex1] A: 0.68003 | B: 0.68197 | C: 0.67571
[LOGITS Ex2 A] Mean Abs: 1.173 | Max: 5.070
[LOSS Ex2] A: 0.32614 | B: 0.41970 | C: 0.41432
** [JOINT LOSS] ** : 1.065961
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006416 | Grad Max: 0.166731
  -> Layer: shared_layers.0.bias | Grad Mean: 0.230997 | Grad Max: 1.263296
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.009452
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010968 | Grad Max: 0.010968
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.129903
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036552 | Grad Max: 0.680241
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000452 | Grad Max: 0.009645
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016107 | Grad Max: 0.053482
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.001022
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003757 | Grad Max: 0.009232
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000510
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001359 | Grad Max: 0.003636
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003324 | Grad Max: 0.006321
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046390 | Grad Max: 0.046390
[GRADIENT NORM TOTAL] 4.6730

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.275
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002699  0.49973008] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 12/2036 | B: 179/1869 | C: 250/1798
[LOSS Ex1] A: 0.67958 | B: 0.68159 | C: 0.67695
[LOGITS Ex2 A] Mean Abs: 1.159 | Max: 4.913
[LOSS Ex2] A: 0.33371 | B: 0.45441 | C: 0.43490
** [JOINT LOSS] ** : 1.087051
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008072 | Grad Max: 0.207689
  -> Layer: shared_layers.0.bias | Grad Mean: 0.337294 | Grad Max: 1.897505
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.009729
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011248 | Grad Max: 0.011248
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002810 | Grad Max: 0.164753
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053165 | Grad Max: 0.918640
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000666 | Grad Max: 0.015827
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023907 | Grad Max: 0.078456
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000116 | Grad Max: 0.001471
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005542 | Grad Max: 0.012873
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.000693
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002012 | Grad Max: 0.005078
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004925 | Grad Max: 0.010156
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.069178 | Grad Max: 0.069178
[GRADIENT NORM TOTAL] 6.8210

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.282
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062246 0.4937754] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 13/2035 | B: 194/1854 | C: 245/1803
[LOSS Ex1] A: 0.67870 | B: 0.68193 | C: 0.67587
[LOGITS Ex2 A] Mean Abs: 1.145 | Max: 5.193
[LOSS Ex2] A: 0.33039 | B: 0.43706 | C: 0.40186
** [JOINT LOSS] ** : 1.068600
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005027 | Grad Max: 0.121496
  -> Layer: shared_layers.0.bias | Grad Mean: 0.227211 | Grad Max: 1.301727
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.009497
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005922 | Grad Max: 0.005922
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001838 | Grad Max: 0.136711
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035021 | Grad Max: 0.755607
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.011709
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016200 | Grad Max: 0.058555
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001041
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003742 | Grad Max: 0.008594
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000453
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001351 | Grad Max: 0.003447
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003309 | Grad Max: 0.006329
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046854 | Grad Max: 0.046854
[GRADIENT NORM TOTAL] 4.5871

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.244
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084439 0.4915561] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 12/2036 | B: 186/1862 | C: 261/1787
[LOSS Ex1] A: 0.68104 | B: 0.67985 | C: 0.67602
[LOGITS Ex2 A] Mean Abs: 1.157 | Max: 4.583
[LOSS Ex2] A: 0.33219 | B: 0.42261 | C: 0.39711
** [JOINT LOSS] ** : 1.062940
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001809 | Grad Max: 0.050939
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100730 | Grad Max: 0.577424
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.007667
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001884 | Grad Max: 0.001884
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000807 | Grad Max: 0.065569
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014680 | Grad Max: 0.378716
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.005186
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005983 | Grad Max: 0.025144
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000449
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001356 | Grad Max: 0.003604
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000486 | Grad Max: 0.001546
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000879 | Grad Max: 0.002923
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015109 | Grad Max: 0.015109
[GRADIENT NORM TOTAL] 2.1254

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.171
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108137 0.4891863] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.013
[MASKS] A(Pass/Fail): 15/2033 | B: 171/1685 | C: 149/1227
[LOSS Ex1] A: 0.68062 | B: 0.68189 | C: 0.67931
[LOGITS Ex2 A] Mean Abs: 1.147 | Max: 4.879
[LOSS Ex2] A: 0.32998 | B: 0.42412 | C: 0.39923
** [JOINT LOSS] ** : 1.065050
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004701 | Grad Max: 0.131176
  -> Layer: shared_layers.0.bias | Grad Mean: 0.219515 | Grad Max: 1.241413
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001807 | Grad Max: 0.008756
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009163 | Grad Max: 0.009163
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001683 | Grad Max: 0.093318
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032210 | Grad Max: 0.523780
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.010138
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014987 | Grad Max: 0.054274
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000858
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003452 | Grad Max: 0.007639
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000478
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001251 | Grad Max: 0.003263
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002876 | Grad Max: 0.006232
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042213 | Grad Max: 0.042213
[GRADIENT NORM TOTAL] 4.3236

[EPOCH SUMMARY] Train Loss: 1.0653

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0359 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0436 -> New: 1.0359)

############################## EPOCH 29/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54436916 0.45563087] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 15/2033 | B: 179/1869 | C: 228/1820
[LOSS Ex1] A: 0.67969 | B: 0.68150 | C: 0.67715
[LOGITS Ex2 A] Mean Abs: 1.177 | Max: 5.375
[LOSS Ex2] A: 0.31808 | B: 0.44849 | C: 0.38476
** [JOINT LOSS] ** : 1.063223
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003643 | Grad Max: 0.108500
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125937 | Grad Max: 0.672440
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.009611
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012840 | Grad Max: 0.012840
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000991 | Grad Max: 0.054160
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018619 | Grad Max: 0.263913
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000239 | Grad Max: 0.006974
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008530 | Grad Max: 0.031279
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000533
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001971 | Grad Max: 0.004733
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000285
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000708 | Grad Max: 0.001948
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001528 | Grad Max: 0.003839
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023120 | Grad Max: 0.023120
[GRADIENT NORM TOTAL] 2.4632

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50448734 0.49551272] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 13/2035 | B: 194/1854 | C: 241/1807
[LOSS Ex1] A: 0.68152 | B: 0.68184 | C: 0.67793
[LOGITS Ex2 A] Mean Abs: 1.158 | Max: 4.655
[LOSS Ex2] A: 0.31664 | B: 0.43495 | C: 0.40025
** [JOINT LOSS] ** : 1.064380
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004114 | Grad Max: 0.111391
  -> Layer: shared_layers.0.bias | Grad Mean: 0.161005 | Grad Max: 0.884052
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.008737
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012415 | Grad Max: 0.012415
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001325 | Grad Max: 0.078351
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024305 | Grad Max: 0.439279
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.007152
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010994 | Grad Max: 0.036946
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000729
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002577 | Grad Max: 0.006172
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000372
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000932 | Grad Max: 0.002369
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002329 | Grad Max: 0.004955
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032440 | Grad Max: 0.032440
[GRADIENT NORM TOTAL] 3.2416

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.120
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54328895 0.45671108] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 19/2029 | B: 188/1860 | C: 236/1812
[LOSS Ex1] A: 0.68033 | B: 0.67976 | C: 0.67791
[LOGITS Ex2 A] Mean Abs: 1.138 | Max: 5.041
[LOSS Ex2] A: 0.32158 | B: 0.42311 | C: 0.42454
** [JOINT LOSS] ** : 1.069078
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.119901
  -> Layer: shared_layers.0.bias | Grad Mean: 0.256747 | Grad Max: 1.433094
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.008727
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002972 | Grad Max: 0.002972
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.198749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037765 | Grad Max: 1.127033
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000487 | Grad Max: 0.012704
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017636 | Grad Max: 0.063235
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.001191
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004071 | Grad Max: 0.010471
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000492
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001476 | Grad Max: 0.003808
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003641 | Grad Max: 0.006708
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051086 | Grad Max: 0.051086
[GRADIENT NORM TOTAL] 5.2246

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.166
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5249537  0.47504625] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 13/1603 | B: 172/1684 | C: 240/1808
[LOSS Ex1] A: 0.67952 | B: 0.68180 | C: 0.67788
[LOGITS Ex2 A] Mean Abs: 1.193 | Max: 4.725
[LOSS Ex2] A: 0.29876 | B: 0.40459 | C: 0.40059
** [JOINT LOSS] ** : 1.047715
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001480 | Grad Max: 0.029687
  -> Layer: shared_layers.0.bias | Grad Mean: 0.083135 | Grad Max: 0.425735
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001886 | Grad Max: 0.009242
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009236 | Grad Max: 0.009236
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000670 | Grad Max: 0.058582
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012207 | Grad Max: 0.327037
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.004704
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004866 | Grad Max: 0.023616
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000427
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001115 | Grad Max: 0.003413
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000182
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000398 | Grad Max: 0.001282
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000997 | Grad Max: 0.003267
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013401 | Grad Max: 0.013401
[GRADIENT NORM TOTAL] 1.7874

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.292
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5086523  0.49134764] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 12/2036 | B: 179/1869 | C: 211/1837
[LOSS Ex1] A: 0.67985 | B: 0.68141 | C: 0.67875
[LOGITS Ex2 A] Mean Abs: 1.233 | Max: 5.114
[LOSS Ex2] A: 0.32523 | B: 0.43765 | C: 0.38441
** [JOINT LOSS] ** : 1.062436
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005789 | Grad Max: 0.160915
  -> Layer: shared_layers.0.bias | Grad Mean: 0.216411 | Grad Max: 1.109783
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001911 | Grad Max: 0.009284
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010952 | Grad Max: 0.010952
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001802 | Grad Max: 0.129340
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033611 | Grad Max: 0.693514
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.010283
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014529 | Grad Max: 0.054729
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000910
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003369 | Grad Max: 0.008053
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000456
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001215 | Grad Max: 0.003262
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002826 | Grad Max: 0.005963
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040375 | Grad Max: 0.040375
[GRADIENT NORM TOTAL] 4.2712

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.277
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50009024 0.49990976] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 15/2033 | B: 194/1854 | C: 250/1798
[LOSS Ex1] A: 0.67939 | B: 0.68175 | C: 0.67674
[LOGITS Ex2 A] Mean Abs: 1.218 | Max: 4.630
[LOSS Ex2] A: 0.32722 | B: 0.44416 | C: 0.40829
** [JOINT LOSS] ** : 1.072521
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006542 | Grad Max: 0.162992
  -> Layer: shared_layers.0.bias | Grad Mean: 0.295633 | Grad Max: 1.644493
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.010154
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017846 | Grad Max: 0.017846
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002386 | Grad Max: 0.150165
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045389 | Grad Max: 0.854643
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000564 | Grad Max: 0.012616
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020397 | Grad Max: 0.067647
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001258
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004698 | Grad Max: 0.011533
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000599
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001677 | Grad Max: 0.004309
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003835 | Grad Max: 0.007094
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.055248 | Grad Max: 0.055248
[GRADIENT NORM TOTAL] 5.9344

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.284
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50613165 0.49386832] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 16/2032 | B: 188/1860 | C: 233/1815
[LOSS Ex1] A: 0.67850 | B: 0.67966 | C: 0.67756
[LOGITS Ex2 A] Mean Abs: 1.200 | Max: 4.624
[LOSS Ex2] A: 0.33283 | B: 0.41465 | C: 0.39225
** [JOINT LOSS] ** : 1.058483
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004573 | Grad Max: 0.134973
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152470 | Grad Max: 0.750317
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.009741
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010813 | Grad Max: 0.010813
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001277 | Grad Max: 0.104645
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023316 | Grad Max: 0.568391
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.006152
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010016 | Grad Max: 0.032272
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000634
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002341 | Grad Max: 0.005616
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000351
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000848 | Grad Max: 0.002383
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002114 | Grad Max: 0.004948
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029892 | Grad Max: 0.029892
[GRADIENT NORM TOTAL] 3.0159

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.246
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083878 0.4916122] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.514 | Std: 0.014
[MASKS] A(Pass/Fail): 15/2033 | B: 176/1680 | C: 258/1790
[LOSS Ex1] A: 0.68088 | B: 0.68171 | C: 0.67580
[LOGITS Ex2 A] Mean Abs: 1.139 | Max: 5.422
[LOSS Ex2] A: 0.32046 | B: 0.41752 | C: 0.39632
** [JOINT LOSS] ** : 1.057559
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005147 | Grad Max: 0.125206
  -> Layer: shared_layers.0.bias | Grad Mean: 0.246457 | Grad Max: 1.428818
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001819 | Grad Max: 0.007853
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001720 | Grad Max: 0.001720
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.125184
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037441 | Grad Max: 0.670159
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000475 | Grad Max: 0.011247
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017113 | Grad Max: 0.058921
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.001001
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003952 | Grad Max: 0.009507
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000478
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001410 | Grad Max: 0.003474
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003376 | Grad Max: 0.006877
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047413 | Grad Max: 0.047413
[GRADIENT NORM TOTAL] 4.9999

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.172
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51112145 0.48887855] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.014
[MASKS] A(Pass/Fail): 15/2033 | B: 180/1868 | C: 253/1795
[LOSS Ex1] A: 0.68045 | B: 0.68132 | C: 0.67640
[LOGITS Ex2 A] Mean Abs: 1.105 | Max: 4.765
[LOSS Ex2] A: 0.33096 | B: 0.46255 | C: 0.40630
** [JOINT LOSS] ** : 1.079325
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006605 | Grad Max: 0.161526
  -> Layer: shared_layers.0.bias | Grad Mean: 0.319038 | Grad Max: 1.826216
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.008384
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001936 | Grad Max: 0.001936
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002555 | Grad Max: 0.155922
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048268 | Grad Max: 0.843119
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000613 | Grad Max: 0.014102
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022163 | Grad Max: 0.073370
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001338
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005127 | Grad Max: 0.012222
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000626
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001839 | Grad Max: 0.004639
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004281 | Grad Max: 0.008147
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061593 | Grad Max: 0.061593
[GRADIENT NORM TOTAL] 6.4003

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.261
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54525185 0.45474818] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 17/2031 | B: 196/1852 | C: 271/1777
[LOSS Ex1] A: 0.67948 | B: 0.68167 | C: 0.67614
[LOGITS Ex2 A] Mean Abs: 1.187 | Max: 5.120
[LOSS Ex2] A: 0.30519 | B: 0.44000 | C: 0.38677
** [JOINT LOSS] ** : 1.056419
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004228 | Grad Max: 0.102107
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191104 | Grad Max: 1.023562
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.010523
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019189 | Grad Max: 0.019189
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001526 | Grad Max: 0.079182
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028324 | Grad Max: 0.431318
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.007777
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012935 | Grad Max: 0.042606
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000883
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002995 | Grad Max: 0.007403
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000381
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001069 | Grad Max: 0.002826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002604 | Grad Max: 0.005221
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035890 | Grad Max: 0.035890
[GRADIENT NORM TOTAL] 3.7315

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.294
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50439596 0.495604  ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 13/2035 | B: 189/1859 | C: 253/1795
[LOSS Ex1] A: 0.68135 | B: 0.67957 | C: 0.67710
[LOGITS Ex2 A] Mean Abs: 1.185 | Max: 4.829
[LOSS Ex2] A: 0.32056 | B: 0.42588 | C: 0.39513
** [JOINT LOSS] ** : 1.059864
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.056716
  -> Layer: shared_layers.0.bias | Grad Mean: 0.130522 | Grad Max: 0.717344
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.008863
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009988 | Grad Max: 0.009988
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001017 | Grad Max: 0.066176
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018727 | Grad Max: 0.378650
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.005976
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008556 | Grad Max: 0.030969
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000567
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001921 | Grad Max: 0.004624
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000685 | Grad Max: 0.001870
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001429 | Grad Max: 0.003788
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022373 | Grad Max: 0.022373
[GRADIENT NORM TOTAL] 2.6753

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.122
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54406834 0.45593163] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 21/2027 | B: 180/1676 | C: 262/1786
[LOSS Ex1] A: 0.68015 | B: 0.68162 | C: 0.67599
[LOGITS Ex2 A] Mean Abs: 1.213 | Max: 4.855
[LOSS Ex2] A: 0.33022 | B: 0.42333 | C: 0.38936
** [JOINT LOSS] ** : 1.060225
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003793 | Grad Max: 0.096862
  -> Layer: shared_layers.0.bias | Grad Mean: 0.221442 | Grad Max: 1.237035
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001907 | Grad Max: 0.009234
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006673 | Grad Max: 0.006673
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001713 | Grad Max: 0.109065
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032465 | Grad Max: 0.606849
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000413 | Grad Max: 0.010739
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015048 | Grad Max: 0.057929
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000991
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003447 | Grad Max: 0.008527
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000437
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001231 | Grad Max: 0.003150
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002825 | Grad Max: 0.005428
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041424 | Grad Max: 0.041424
[GRADIENT NORM TOTAL] 4.4420

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.168
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52552354 0.4744764 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 14/1602 | B: 182/1866 | C: 243/1805
[LOSS Ex1] A: 0.67933 | B: 0.68124 | C: 0.67842
[LOGITS Ex2 A] Mean Abs: 1.227 | Max: 5.160
[LOSS Ex2] A: 0.29726 | B: 0.44495 | C: 0.39013
** [JOINT LOSS] ** : 1.057107
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001610 | Grad Max: 0.041090
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084524 | Grad Max: 0.429748
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001854 | Grad Max: 0.008803
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003489 | Grad Max: 0.003489
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000645 | Grad Max: 0.053934
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011803 | Grad Max: 0.306701
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.004186
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005176 | Grad Max: 0.020619
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000364
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001166 | Grad Max: 0.003243
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000198
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000417 | Grad Max: 0.001246
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000760 | Grad Max: 0.002623
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013329 | Grad Max: 0.013329
[GRADIENT NORM TOTAL] 1.7064

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.294
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5086065 0.4913935] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 13/2035 | B: 199/1849 | C: 155/1221
[LOSS Ex1] A: 0.67965 | B: 0.68158 | C: 0.68001
[LOGITS Ex2 A] Mean Abs: 1.193 | Max: 6.454
[LOSS Ex2] A: 0.32004 | B: 0.42313 | C: 0.40047
** [JOINT LOSS] ** : 1.061627
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003969 | Grad Max: 0.090749
  -> Layer: shared_layers.0.bias | Grad Mean: 0.193055 | Grad Max: 1.015774
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001917 | Grad Max: 0.009151
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013301 | Grad Max: 0.013301
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001461 | Grad Max: 0.094780
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027796 | Grad Max: 0.521335
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.008721
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012750 | Grad Max: 0.048075
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000781
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002932 | Grad Max: 0.007177
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000427
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001036 | Grad Max: 0.002899
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002314 | Grad Max: 0.004892
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033960 | Grad Max: 0.033960
[GRADIENT NORM TOTAL] 3.7766

[EPOCH SUMMARY] Train Loss: 1.0621

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0382 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 30/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.280
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50008655 0.49991345] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 18/2030 | B: 192/1856 | C: 255/1793
[LOSS Ex1] A: 0.67917 | B: 0.67947 | C: 0.67691
[LOGITS Ex2 A] Mean Abs: 1.177 | Max: 5.122
[LOSS Ex2] A: 0.30692 | B: 0.41917 | C: 0.39634
** [JOINT LOSS] ** : 1.052663
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004070 | Grad Max: 0.109714
  -> Layer: shared_layers.0.bias | Grad Mean: 0.237020 | Grad Max: 1.288446
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.010258
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016537 | Grad Max: 0.016537
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001814 | Grad Max: 0.112255
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034404 | Grad Max: 0.646463
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.011124
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015730 | Grad Max: 0.060841
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000957
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003628 | Grad Max: 0.008484
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000464
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001293 | Grad Max: 0.003371
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003083 | Grad Max: 0.005950
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043872 | Grad Max: 0.043872
[GRADIENT NORM TOTAL] 4.6645

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.287
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060296 0.4939704] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 18/2030 | B: 180/1676 | C: 225/1823
[LOSS Ex1] A: 0.67829 | B: 0.68153 | C: 0.67916
[LOGITS Ex2 A] Mean Abs: 1.171 | Max: 5.070
[LOSS Ex2] A: 0.33116 | B: 0.40426 | C: 0.38952
** [JOINT LOSS] ** : 1.054638
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002809 | Grad Max: 0.071894
  -> Layer: shared_layers.0.bias | Grad Mean: 0.090289 | Grad Max: 0.421175
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.010030
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014735 | Grad Max: 0.014735
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000748 | Grad Max: 0.057155
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012488 | Grad Max: 0.293412
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.005409
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004497 | Grad Max: 0.026935
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000359
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000978 | Grad Max: 0.003022
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000163
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000331 | Grad Max: 0.001095
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000717 | Grad Max: 0.002535
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010326 | Grad Max: 0.010326
[GRADIENT NORM TOTAL] 1.7572

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.247
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083104  0.49168956] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 15/2033 | B: 182/1866 | C: 264/1784
[LOSS Ex1] A: 0.68071 | B: 0.68114 | C: 0.67543
[LOGITS Ex2 A] Mean Abs: 1.198 | Max: 4.854
[LOSS Ex2] A: 0.32650 | B: 0.44545 | C: 0.38769
** [JOINT LOSS] ** : 1.065639
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004567 | Grad Max: 0.105548
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208715 | Grad Max: 1.154316
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001843 | Grad Max: 0.007668
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000569 | Grad Max: 0.000569
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001642 | Grad Max: 0.099167
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030898 | Grad Max: 0.536921
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000385 | Grad Max: 0.009177
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013950 | Grad Max: 0.045666
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000795
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003214 | Grad Max: 0.007386
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000393
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001141 | Grad Max: 0.003024
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002455 | Grad Max: 0.005099
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037342 | Grad Max: 0.037342
[GRADIENT NORM TOTAL] 4.1197

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5114407 0.4885593] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.514 | Std: 0.014
[MASKS] A(Pass/Fail): 15/2033 | B: 202/1846 | C: 231/1817
[LOSS Ex1] A: 0.68028 | B: 0.68149 | C: 0.67720
[LOGITS Ex2 A] Mean Abs: 1.190 | Max: 4.756
[LOSS Ex2] A: 0.33469 | B: 0.44359 | C: 0.39194
** [JOINT LOSS] ** : 1.069730
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005903 | Grad Max: 0.160076
  -> Layer: shared_layers.0.bias | Grad Mean: 0.273634 | Grad Max: 1.499787
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.008386
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001382 | Grad Max: 0.001382
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.162097
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040228 | Grad Max: 0.893487
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000500 | Grad Max: 0.011689
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018070 | Grad Max: 0.063294
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001160
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004152 | Grad Max: 0.010161
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000505
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001480 | Grad Max: 0.003780
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003399 | Grad Max: 0.006581
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048774 | Grad Max: 0.048774
[GRADIENT NORM TOTAL] 5.4433

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.263
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5462029 0.4537971] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.014
[MASKS] A(Pass/Fail): 18/2030 | B: 192/1856 | C: 263/1785
[LOSS Ex1] A: 0.67927 | B: 0.67937 | C: 0.67729
[LOGITS Ex2 A] Mean Abs: 1.210 | Max: 4.907
[LOSS Ex2] A: 0.30830 | B: 0.42318 | C: 0.39039
** [JOINT LOSS] ** : 1.052601
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002902 | Grad Max: 0.071344
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107529 | Grad Max: 0.565970
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.009441
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011218 | Grad Max: 0.011218
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000851 | Grad Max: 0.088288
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015579 | Grad Max: 0.490292
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.005337
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006855 | Grad Max: 0.026738
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000500
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001580 | Grad Max: 0.004059
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000569 | Grad Max: 0.001572
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001314 | Grad Max: 0.003367
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019569 | Grad Max: 0.019569
[GRADIENT NORM TOTAL] 2.1627

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.296
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50429904 0.49570093] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 16/2032 | B: 182/1674 | C: 224/1824
[LOSS Ex1] A: 0.68117 | B: 0.68144 | C: 0.67896
[LOGITS Ex2 A] Mean Abs: 1.172 | Max: 4.721
[LOSS Ex2] A: 0.31243 | B: 0.41676 | C: 0.40118
** [JOINT LOSS] ** : 1.057312
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005827 | Grad Max: 0.180607
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203659 | Grad Max: 1.107930
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001865 | Grad Max: 0.008779
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013936 | Grad Max: 0.013936
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001748 | Grad Max: 0.125365
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032161 | Grad Max: 0.669681
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000402 | Grad Max: 0.008864
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014432 | Grad Max: 0.048244
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000936
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003346 | Grad Max: 0.008266
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000437
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001182 | Grad Max: 0.003015
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002830 | Grad Max: 0.006201
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039159 | Grad Max: 0.039159
[GRADIENT NORM TOTAL] 4.1647

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.125
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5449559 0.4550441] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 24/2024 | B: 184/1864 | C: 243/1805
[LOSS Ex1] A: 0.67995 | B: 0.68105 | C: 0.67656
[LOGITS Ex2 A] Mean Abs: 1.168 | Max: 5.406
[LOSS Ex2] A: 0.33762 | B: 0.46314 | C: 0.41371
** [JOINT LOSS] ** : 1.084008
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006979 | Grad Max: 0.172315
  -> Layer: shared_layers.0.bias | Grad Mean: 0.306028 | Grad Max: 1.722298
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001885 | Grad Max: 0.008755
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004277 | Grad Max: 0.004277
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002522 | Grad Max: 0.143203
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047561 | Grad Max: 0.782215
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000603 | Grad Max: 0.014887
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021838 | Grad Max: 0.076066
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001163
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005042 | Grad Max: 0.010885
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000604
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001809 | Grad Max: 0.004571
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004366 | Grad Max: 0.008540
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061752 | Grad Max: 0.061752
[GRADIENT NORM TOTAL] 6.1594

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.170
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5261418  0.47385818] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 18/1598 | B: 205/1843 | C: 251/1797
[LOSS Ex1] A: 0.67913 | B: 0.68140 | C: 0.67621
[LOGITS Ex2 A] Mean Abs: 1.212 | Max: 4.751
[LOSS Ex2] A: 0.30536 | B: 0.44588 | C: 0.39153
** [JOINT LOSS] ** : 1.059838
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005667 | Grad Max: 0.140175
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198704 | Grad Max: 1.125647
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.008797
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000096 | Grad Max: 0.000096
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001658 | Grad Max: 0.136065
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030616 | Grad Max: 0.726014
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000380 | Grad Max: 0.008897
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013579 | Grad Max: 0.044573
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000850
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003149 | Grad Max: 0.007327
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000429
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001128 | Grad Max: 0.002914
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002879 | Grad Max: 0.005669
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039068 | Grad Max: 0.039068
[GRADIENT NORM TOTAL] 4.0025

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085377  0.49146232] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 17/2031 | B: 194/1854 | C: 265/1783
[LOSS Ex1] A: 0.67945 | B: 0.67927 | C: 0.67580
[LOGITS Ex2 A] Mean Abs: 1.241 | Max: 6.011
[LOSS Ex2] A: 0.32534 | B: 0.42848 | C: 0.38935
** [JOINT LOSS] ** : 1.059228
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003000 | Grad Max: 0.077885
  -> Layer: shared_layers.0.bias | Grad Mean: 0.192528 | Grad Max: 1.077483
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.009814
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013851 | Grad Max: 0.013851
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001459 | Grad Max: 0.105889
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027570 | Grad Max: 0.587453
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000354 | Grad Max: 0.010376
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012985 | Grad Max: 0.055157
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000842
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002955 | Grad Max: 0.007262
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000378
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001049 | Grad Max: 0.002829
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002373 | Grad Max: 0.004702
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034933 | Grad Max: 0.034933
[GRADIENT NORM TOTAL] 3.9181

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.282
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002875  0.49971256] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 21/2027 | B: 182/1674 | C: 233/1815
[LOSS Ex1] A: 0.67895 | B: 0.68134 | C: 0.67748
[LOGITS Ex2 A] Mean Abs: 1.236 | Max: 4.891
[LOSS Ex2] A: 0.33347 | B: 0.41211 | C: 0.39520
** [JOINT LOSS] ** : 1.059519
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004121 | Grad Max: 0.098908
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215778 | Grad Max: 1.250906
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001997 | Grad Max: 0.009717
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014353 | Grad Max: 0.014353
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001698 | Grad Max: 0.098819
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032427 | Grad Max: 0.562722
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.010094
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014885 | Grad Max: 0.055311
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000910
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003400 | Grad Max: 0.008017
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000423
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001205 | Grad Max: 0.003203
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002722 | Grad Max: 0.005244
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040452 | Grad Max: 0.040452
[GRADIENT NORM TOTAL] 4.3785

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.289
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50588006 0.49411994] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 22/2026 | B: 188/1860 | C: 261/1787
[LOSS Ex1] A: 0.67806 | B: 0.68096 | C: 0.67642
[LOGITS Ex2 A] Mean Abs: 1.226 | Max: 5.380
[LOSS Ex2] A: 0.32702 | B: 0.44294 | C: 0.39312
** [JOINT LOSS] ** : 1.066174
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004024 | Grad Max: 0.127890
  -> Layer: shared_layers.0.bias | Grad Mean: 0.144040 | Grad Max: 0.739506
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.009825
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011537 | Grad Max: 0.011537
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001202 | Grad Max: 0.074862
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022227 | Grad Max: 0.421676
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.006881
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010073 | Grad Max: 0.035614
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000773
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002325 | Grad Max: 0.005886
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000315
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000816 | Grad Max: 0.002152
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001697 | Grad Max: 0.003494
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026005 | Grad Max: 0.026005
[GRADIENT NORM TOTAL] 2.8615

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.249
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082236  0.49177638] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 17/2031 | B: 206/1842 | C: 228/1820
[LOSS Ex1] A: 0.68053 | B: 0.68131 | C: 0.67728
[LOGITS Ex2 A] Mean Abs: 1.172 | Max: 5.696
[LOSS Ex2] A: 0.30970 | B: 0.43841 | C: 0.39002
** [JOINT LOSS] ** : 1.059088
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004266 | Grad Max: 0.097288
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208735 | Grad Max: 1.154611
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001773 | Grad Max: 0.007983
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003970 | Grad Max: 0.003970
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001637 | Grad Max: 0.097508
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030562 | Grad Max: 0.544967
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.009315
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014047 | Grad Max: 0.051589
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000807
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003228 | Grad Max: 0.007313
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000408
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001148 | Grad Max: 0.002993
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002833 | Grad Max: 0.005663
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039171 | Grad Max: 0.039171
[GRADIENT NORM TOTAL] 4.1371

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.174
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51174    0.48826003] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 20/2028 | B: 194/1854 | C: 231/1817
[LOSS Ex1] A: 0.68010 | B: 0.67918 | C: 0.67915
[LOGITS Ex2 A] Mean Abs: 1.116 | Max: 5.223
[LOSS Ex2] A: 0.32483 | B: 0.42547 | C: 0.40596
** [JOINT LOSS] ** : 1.064898
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005992 | Grad Max: 0.147702
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289235 | Grad Max: 1.571650
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001825 | Grad Max: 0.008702
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006874 | Grad Max: 0.006874
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.130858
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042202 | Grad Max: 0.714703
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.012292
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019330 | Grad Max: 0.066977
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001279
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004454 | Grad Max: 0.010896
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000570
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001580 | Grad Max: 0.003940
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003696 | Grad Max: 0.007465
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053255 | Grad Max: 0.053255
[GRADIENT NORM TOTAL] 5.6507

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.266
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5471337 0.4528663] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 21/2027 | B: 184/1672 | C: 172/1204
[LOSS Ex1] A: 0.67905 | B: 0.68126 | C: 0.67328
[LOGITS Ex2 A] Mean Abs: 1.194 | Max: 5.212
[LOSS Ex2] A: 0.29849 | B: 0.40373 | C: 0.39020
** [JOINT LOSS] ** : 1.042005
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001913 | Grad Max: 0.073212
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153810 | Grad Max: 0.834651
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.009910
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011594 | Grad Max: 0.011594
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001123 | Grad Max: 0.071909
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020812 | Grad Max: 0.393184
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000263 | Grad Max: 0.006526
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009589 | Grad Max: 0.035256
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000684
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002184 | Grad Max: 0.005696
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000292
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000765 | Grad Max: 0.002130
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001750 | Grad Max: 0.004343
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024590 | Grad Max: 0.024590
[GRADIENT NORM TOTAL] 3.0486

[EPOCH SUMMARY] Train Loss: 1.0605

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0329 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0359 -> New: 1.0329)

############################## EPOCH 31/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.298
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041964 0.4958036] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 18/2030 | B: 188/1860 | C: 222/1826
[LOSS Ex1] A: 0.68100 | B: 0.68087 | C: 0.67744
[LOGITS Ex2 A] Mean Abs: 1.225 | Max: 5.452
[LOSS Ex2] A: 0.29993 | B: 0.44767 | C: 0.38398
** [JOINT LOSS] ** : 1.056964
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003417 | Grad Max: 0.085568
  -> Layer: shared_layers.0.bias | Grad Mean: 0.195300 | Grad Max: 1.076839
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001932 | Grad Max: 0.009004
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013581 | Grad Max: 0.013581
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001494 | Grad Max: 0.093937
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028809 | Grad Max: 0.509675
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000364 | Grad Max: 0.008784
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013335 | Grad Max: 0.047281
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000745
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003022 | Grad Max: 0.006919
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000406
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001070 | Grad Max: 0.002876
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002263 | Grad Max: 0.004933
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035318 | Grad Max: 0.035318
[GRADIENT NORM TOTAL] 3.9071

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.127
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54576504 0.454235  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 31/2017 | B: 208/1840 | C: 237/1811
[LOSS Ex1] A: 0.67977 | B: 0.68122 | C: 0.67744
[LOGITS Ex2 A] Mean Abs: 1.242 | Max: 4.904
[LOSS Ex2] A: 0.32321 | B: 0.43693 | C: 0.38471
** [JOINT LOSS] ** : 1.061095
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004404 | Grad Max: 0.108093
  -> Layer: shared_layers.0.bias | Grad Mean: 0.249441 | Grad Max: 1.371514
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001855 | Grad Max: 0.008909
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007726 | Grad Max: 0.007726
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001882 | Grad Max: 0.113334
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035967 | Grad Max: 0.636216
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.011239
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016512 | Grad Max: 0.063049
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.001016
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003785 | Grad Max: 0.009226
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000465
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001342 | Grad Max: 0.003486
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002978 | Grad Max: 0.005968
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044427 | Grad Max: 0.044427
[GRADIENT NORM TOTAL] 4.9256

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5266981  0.47330186] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 26/1590 | B: 195/1853 | C: 252/1796
[LOSS Ex1] A: 0.67894 | B: 0.67908 | C: 0.67738
[LOGITS Ex2 A] Mean Abs: 1.244 | Max: 5.273
[LOSS Ex2] A: 0.30477 | B: 0.42377 | C: 0.37031
** [JOINT LOSS] ** : 1.044746
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002721 | Grad Max: 0.059979
  -> Layer: shared_layers.0.bias | Grad Mean: 0.114844 | Grad Max: 0.637439
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.009298
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007963 | Grad Max: 0.007963
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000922 | Grad Max: 0.059265
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017516 | Grad Max: 0.337286
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.005656
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008085 | Grad Max: 0.031014
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000532
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001831 | Grad Max: 0.004603
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000653 | Grad Max: 0.001983
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001316 | Grad Max: 0.003955
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021182 | Grad Max: 0.021182
[GRADIENT NORM TOTAL] 2.3309

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.299
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084773  0.49152276] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 27/2021 | B: 184/1672 | C: 252/1796
[LOSS Ex1] A: 0.67925 | B: 0.68117 | C: 0.67642
[LOGITS Ex2 A] Mean Abs: 1.207 | Max: 6.359
[LOSS Ex2] A: 0.30040 | B: 0.41519 | C: 0.40455
** [JOINT LOSS] ** : 1.052326
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004127 | Grad Max: 0.092690
  -> Layer: shared_layers.0.bias | Grad Mean: 0.192684 | Grad Max: 1.036279
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.009758
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014880 | Grad Max: 0.014880
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001523 | Grad Max: 0.174107
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028635 | Grad Max: 0.980445
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.008239
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012996 | Grad Max: 0.045051
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000754
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002992 | Grad Max: 0.007087
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000375
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001057 | Grad Max: 0.002839
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002441 | Grad Max: 0.004897
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034874 | Grad Max: 0.034874
[GRADIENT NORM TOTAL] 3.9210

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.284
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50049126 0.49950877] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 31/2017 | B: 189/1859 | C: 233/1815
[LOSS Ex1] A: 0.67874 | B: 0.68078 | C: 0.67615
[LOGITS Ex2 A] Mean Abs: 1.188 | Max: 5.751
[LOSS Ex2] A: 0.31386 | B: 0.45386 | C: 0.41499
** [JOINT LOSS] ** : 1.072794
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006006 | Grad Max: 0.147200
  -> Layer: shared_layers.0.bias | Grad Mean: 0.328629 | Grad Max: 1.802913
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.009507
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008498 | Grad Max: 0.008498
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002574 | Grad Max: 0.233592
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048765 | Grad Max: 1.309110
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.015260
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022340 | Grad Max: 0.081715
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005128 | Grad Max: 0.011698
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000601
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001825 | Grad Max: 0.004670
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004233 | Grad Max: 0.007880
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061297 | Grad Max: 0.061297
[GRADIENT NORM TOTAL] 6.6731

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5057465  0.49425352] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 32/2016 | B: 210/1838 | C: 241/1807
[LOSS Ex1] A: 0.67785 | B: 0.68114 | C: 0.67709
[LOGITS Ex2 A] Mean Abs: 1.193 | Max: 5.001
[LOSS Ex2] A: 0.30825 | B: 0.43318 | C: 0.42117
** [JOINT LOSS] ** : 1.066221
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003390 | Grad Max: 0.110160
  -> Layer: shared_layers.0.bias | Grad Mean: 0.200708 | Grad Max: 1.139364
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.009092
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004962 | Grad Max: 0.004962
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001539 | Grad Max: 0.099331
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029211 | Grad Max: 0.561482
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000368 | Grad Max: 0.009440
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013599 | Grad Max: 0.051670
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000728
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003126 | Grad Max: 0.007160
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000391
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001121 | Grad Max: 0.002885
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002767 | Grad Max: 0.005193
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039834 | Grad Max: 0.039834
[GRADIENT NORM TOTAL] 3.9903

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.251
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081375  0.49186242] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 34/2014 | B: 196/1852 | C: 239/1809
[LOSS Ex1] A: 0.68037 | B: 0.67899 | C: 0.67719
[LOGITS Ex2 A] Mean Abs: 1.215 | Max: 4.746
[LOSS Ex2] A: 0.32435 | B: 0.41596 | C: 0.41264
** [JOINT LOSS] ** : 1.063167
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003271 | Grad Max: 0.087570
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152281 | Grad Max: 0.857749
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001825 | Grad Max: 0.007621
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000495 | Grad Max: 0.000495
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.075780
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021832 | Grad Max: 0.426813
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.006294
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010056 | Grad Max: 0.033897
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000635
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002300 | Grad Max: 0.005646
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000811 | Grad Max: 0.002145
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001695 | Grad Max: 0.003531
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025831 | Grad Max: 0.025831
[GRADIENT NORM TOTAL] 2.9726

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.175
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5120906  0.48790935] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.014
[MASKS] A(Pass/Fail): 27/2021 | B: 184/1672 | C: 263/1785
[LOSS Ex1] A: 0.67994 | B: 0.68108 | C: 0.67564
[LOGITS Ex2 A] Mean Abs: 1.198 | Max: 5.338
[LOSS Ex2] A: 0.31069 | B: 0.41908 | C: 0.39522
** [JOINT LOSS] ** : 1.053879
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004385 | Grad Max: 0.109784
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196777 | Grad Max: 1.152315
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001830 | Grad Max: 0.008190
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000094 | Grad Max: 0.000094
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001556 | Grad Max: 0.103626
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029252 | Grad Max: 0.588486
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000363 | Grad Max: 0.008831
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013313 | Grad Max: 0.051239
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000777
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003064 | Grad Max: 0.006726
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000386
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001087 | Grad Max: 0.002873
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002373 | Grad Max: 0.005149
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036299 | Grad Max: 0.036299
[GRADIENT NORM TOTAL] 3.9388

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.268
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5480591  0.45194086] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 32/2016 | B: 189/1859 | C: 268/1780
[LOSS Ex1] A: 0.67885 | B: 0.68069 | C: 0.67534
[LOGITS Ex2 A] Mean Abs: 1.209 | Max: 5.100
[LOSS Ex2] A: 0.30081 | B: 0.43593 | C: 0.37239
** [JOINT LOSS] ** : 1.048002
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.055518
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094466 | Grad Max: 0.539727
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.009704
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014012 | Grad Max: 0.014012
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000751 | Grad Max: 0.048266
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013692 | Grad Max: 0.263290
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.004806
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006181 | Grad Max: 0.023445
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000446
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001425 | Grad Max: 0.003678
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000204
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000507 | Grad Max: 0.001412
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000967 | Grad Max: 0.003160
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016071 | Grad Max: 0.016071
[GRADIENT NORM TOTAL] 1.8977

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.301
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50410914 0.49589086] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 33/2015 | B: 213/1835 | C: 256/1792
[LOSS Ex1] A: 0.68083 | B: 0.68105 | C: 0.67639
[LOGITS Ex2 A] Mean Abs: 1.184 | Max: 4.762
[LOSS Ex2] A: 0.29956 | B: 0.44006 | C: 0.40326
** [JOINT LOSS] ** : 1.060383
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006157 | Grad Max: 0.158452
  -> Layer: shared_layers.0.bias | Grad Mean: 0.248653 | Grad Max: 1.309926
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.008583
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010495 | Grad Max: 0.010495
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002005 | Grad Max: 0.114558
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037189 | Grad Max: 0.632352
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.009385
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016570 | Grad Max: 0.054660
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001026
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003827 | Grad Max: 0.009208
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000451
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001355 | Grad Max: 0.003450
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003206 | Grad Max: 0.005978
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045624 | Grad Max: 0.045624
[GRADIENT NORM TOTAL] 4.8789

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.129
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54660416 0.4533958 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 45/2003 | B: 197/1851 | C: 228/1820
[LOSS Ex1] A: 0.67958 | B: 0.67889 | C: 0.67718
[LOGITS Ex2 A] Mean Abs: 1.180 | Max: 4.804
[LOSS Ex2] A: 0.31782 | B: 0.42881 | C: 0.40077
** [JOINT LOSS] ** : 1.061014
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007096 | Grad Max: 0.173807
  -> Layer: shared_layers.0.bias | Grad Mean: 0.321708 | Grad Max: 1.734709
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001943 | Grad Max: 0.008876
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007685 | Grad Max: 0.007685
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002588 | Grad Max: 0.145691
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048846 | Grad Max: 0.785763
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.014008
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022226 | Grad Max: 0.074157
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001315
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005110 | Grad Max: 0.011972
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000574
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001807 | Grad Max: 0.004615
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004194 | Grad Max: 0.007872
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060538 | Grad Max: 0.060538
[GRADIENT NORM TOTAL] 6.3849

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.175
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5272953  0.47270474] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 33/1583 | B: 188/1668 | C: 250/1798
[LOSS Ex1] A: 0.67875 | B: 0.68098 | C: 0.67635
[LOGITS Ex2 A] Mean Abs: 1.224 | Max: 5.024
[LOSS Ex2] A: 0.29011 | B: 0.40574 | C: 0.38036
** [JOINT LOSS] ** : 1.037429
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004889 | Grad Max: 0.134210
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239265 | Grad Max: 1.259770
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.009224
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005145 | Grad Max: 0.005145
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001859 | Grad Max: 0.110956
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034861 | Grad Max: 0.612765
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000436 | Grad Max: 0.009723
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016006 | Grad Max: 0.055441
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000970
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003675 | Grad Max: 0.008582
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000427
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001282 | Grad Max: 0.003244
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002885 | Grad Max: 0.006434
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041493 | Grad Max: 0.041493
[GRADIENT NORM TOTAL] 4.7400

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.302
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084324  0.49156755] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 37/2011 | B: 191/1857 | C: 235/1813
[LOSS Ex1] A: 0.67906 | B: 0.68060 | C: 0.67664
[LOGITS Ex2 A] Mean Abs: 1.245 | Max: 5.556
[LOSS Ex2] A: 0.29793 | B: 0.44208 | C: 0.39011
** [JOINT LOSS] ** : 1.055476
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003245 | Grad Max: 0.073972
  -> Layer: shared_layers.0.bias | Grad Mean: 0.138158 | Grad Max: 0.683315
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.009225
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008951 | Grad Max: 0.008951
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001074 | Grad Max: 0.092359
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020124 | Grad Max: 0.525287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.007029
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009104 | Grad Max: 0.037173
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000541
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002078 | Grad Max: 0.004893
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000744 | Grad Max: 0.002055
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001707 | Grad Max: 0.004199
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025788 | Grad Max: 0.025788
[GRADIENT NORM TOTAL] 2.7190

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.286
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50070375 0.49929625] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 42/2006 | B: 216/1832 | C: 178/1198
[LOSS Ex1] A: 0.67853 | B: 0.68096 | C: 0.67719
[LOGITS Ex2 A] Mean Abs: 1.249 | Max: 5.199
[LOSS Ex2] A: 0.32893 | B: 0.43407 | C: 0.39717
** [JOINT LOSS] ** : 1.065615
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005185 | Grad Max: 0.113163
  -> Layer: shared_layers.0.bias | Grad Mean: 0.276575 | Grad Max: 1.454878
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.010487
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.021267 | Grad Max: 0.021267
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.147824
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039941 | Grad Max: 0.850868
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000500 | Grad Max: 0.011268
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018366 | Grad Max: 0.061357
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.001072
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004195 | Grad Max: 0.009881
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000467
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001471 | Grad Max: 0.003739
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003119 | Grad Max: 0.005815
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047270 | Grad Max: 0.047270
[GRADIENT NORM TOTAL] 5.4845

[EPOCH SUMMARY] Train Loss: 1.0571

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0270 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0329 -> New: 1.0270)

############################## EPOCH 32/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.294
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056307  0.49436936] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 39/2009 | B: 199/1849 | C: 248/1800
[LOSS Ex1] A: 0.67764 | B: 0.67879 | C: 0.67650
[LOGITS Ex2 A] Mean Abs: 1.231 | Max: 4.799
[LOSS Ex2] A: 0.32637 | B: 0.42379 | C: 0.37957
** [JOINT LOSS] ** : 1.054220
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004254 | Grad Max: 0.133847
  -> Layer: shared_layers.0.bias | Grad Mean: 0.138790 | Grad Max: 0.672072
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.010196
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014212 | Grad Max: 0.014212
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001208 | Grad Max: 0.076862
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022074 | Grad Max: 0.402982
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.006714
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009698 | Grad Max: 0.036333
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000602
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002238 | Grad Max: 0.005728
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000289
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000795 | Grad Max: 0.002141
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001797 | Grad Max: 0.003917
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026501 | Grad Max: 0.026501
[GRADIENT NORM TOTAL] 2.8257

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.253
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080842  0.49191582] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 47/2001 | B: 188/1668 | C: 236/1812
[LOSS Ex1] A: 0.68021 | B: 0.68089 | C: 0.67757
[LOGITS Ex2 A] Mean Abs: 1.181 | Max: 5.267
[LOSS Ex2] A: 0.29952 | B: 0.41072 | C: 0.41326
** [JOINT LOSS] ** : 1.054057
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004819 | Grad Max: 0.131335
  -> Layer: shared_layers.0.bias | Grad Mean: 0.217381 | Grad Max: 1.158885
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.008320
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007502 | Grad Max: 0.007502
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001693 | Grad Max: 0.120237
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031789 | Grad Max: 0.649333
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000394 | Grad Max: 0.009501
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014372 | Grad Max: 0.051157
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000957
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003300 | Grad Max: 0.008686
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000407
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001162 | Grad Max: 0.002960
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002755 | Grad Max: 0.005121
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039124 | Grad Max: 0.039124
[GRADIENT NORM TOTAL] 4.3293

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.177
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5124284  0.48757157] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 34/2014 | B: 193/1855 | C: 220/1828
[LOSS Ex1] A: 0.67977 | B: 0.68051 | C: 0.67855
[LOGITS Ex2 A] Mean Abs: 1.141 | Max: 5.659
[LOSS Ex2] A: 0.31375 | B: 0.45825 | C: 0.42091
** [JOINT LOSS] ** : 1.077243
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005994 | Grad Max: 0.147992
  -> Layer: shared_layers.0.bias | Grad Mean: 0.327635 | Grad Max: 1.781703
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001790 | Grad Max: 0.008375
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004725 | Grad Max: 0.004725
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002499 | Grad Max: 0.179392
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047421 | Grad Max: 1.016468
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000597 | Grad Max: 0.014451
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021883 | Grad Max: 0.078354
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001240
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005014 | Grad Max: 0.011019
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000564
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001775 | Grad Max: 0.004333
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004038 | Grad Max: 0.007761
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058568 | Grad Max: 0.058568
[GRADIENT NORM TOTAL] 6.5328

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.270
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54896295 0.45103708] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 51/1997 | B: 217/1831 | C: 262/1786
[LOSS Ex1] A: 0.67865 | B: 0.68087 | C: 0.67621
[LOGITS Ex2 A] Mean Abs: 1.200 | Max: 5.252
[LOSS Ex2] A: 0.29764 | B: 0.43297 | C: 0.39840
** [JOINT LOSS] ** : 1.054915
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003763 | Grad Max: 0.088303
  -> Layer: shared_layers.0.bias | Grad Mean: 0.171571 | Grad Max: 0.958135
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.009878
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015562 | Grad Max: 0.015562
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001367 | Grad Max: 0.113305
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025285 | Grad Max: 0.637056
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.007155
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011443 | Grad Max: 0.040785
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000695
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002637 | Grad Max: 0.006385
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000367
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000927 | Grad Max: 0.002585
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002163 | Grad Max: 0.004804
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030935 | Grad Max: 0.030935
[GRADIENT NORM TOTAL] 3.4528

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.303
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040041  0.49599588] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 45/2003 | B: 200/1848 | C: 244/1804
[LOSS Ex1] A: 0.68067 | B: 0.67870 | C: 0.67649
[LOGITS Ex2 A] Mean Abs: 1.232 | Max: 4.607
[LOSS Ex2] A: 0.30106 | B: 0.42358 | C: 0.38812
** [JOINT LOSS] ** : 1.049535
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002584 | Grad Max: 0.063042
  -> Layer: shared_layers.0.bias | Grad Mean: 0.167004 | Grad Max: 0.914407
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001912 | Grad Max: 0.008705
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009101 | Grad Max: 0.009101
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001262 | Grad Max: 0.092681
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024275 | Grad Max: 0.516190
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.008294
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011226 | Grad Max: 0.042513
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000738
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002552 | Grad Max: 0.006519
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000311
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000902 | Grad Max: 0.002335
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001893 | Grad Max: 0.004308
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029557 | Grad Max: 0.029557
[GRADIENT NORM TOTAL] 3.4145

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.131
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54739887 0.4526012 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 57/1991 | B: 189/1667 | C: 223/1825
[LOSS Ex1] A: 0.67941 | B: 0.68080 | C: 0.67766
[LOGITS Ex2 A] Mean Abs: 1.258 | Max: 4.911
[LOSS Ex2] A: 0.32472 | B: 0.41985 | C: 0.39012
** [JOINT LOSS] ** : 1.057517
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005848 | Grad Max: 0.135018
  -> Layer: shared_layers.0.bias | Grad Mean: 0.280132 | Grad Max: 1.541836
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.008917
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009173 | Grad Max: 0.009173
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.130846
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040999 | Grad Max: 0.727698
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000508 | Grad Max: 0.012543
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018685 | Grad Max: 0.065764
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001041
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004265 | Grad Max: 0.009522
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000523
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001496 | Grad Max: 0.003890
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003262 | Grad Max: 0.006314
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048734 | Grad Max: 0.048734
[GRADIENT NORM TOTAL] 5.5606

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52785987 0.47214013] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 44/1572 | B: 193/1855 | C: 253/1795
[LOSS Ex1] A: 0.67857 | B: 0.68042 | C: 0.67516
[LOGITS Ex2 A] Mean Abs: 1.276 | Max: 5.641
[LOSS Ex2] A: 0.28804 | B: 0.43872 | C: 0.37796
** [JOINT LOSS] ** : 1.046291
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002466 | Grad Max: 0.064252
  -> Layer: shared_layers.0.bias | Grad Mean: 0.115404 | Grad Max: 0.608130
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001947 | Grad Max: 0.008439
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000672 | Grad Max: 0.000672
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000892 | Grad Max: 0.052569
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016657 | Grad Max: 0.294588
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.004815
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007526 | Grad Max: 0.026467
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000506
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001719 | Grad Max: 0.004588
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000250
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000607 | Grad Max: 0.001850
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001162 | Grad Max: 0.003135
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019262 | Grad Max: 0.019262
[GRADIENT NORM TOTAL] 2.2708

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.304
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083601 0.4916399] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 53/1995 | B: 219/1829 | C: 251/1797
[LOSS Ex1] A: 0.67888 | B: 0.68078 | C: 0.67703
[LOGITS Ex2 A] Mean Abs: 1.223 | Max: 5.545
[LOSS Ex2] A: 0.30885 | B: 0.43240 | C: 0.38462
** [JOINT LOSS] ** : 1.054186
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003776 | Grad Max: 0.086451
  -> Layer: shared_layers.0.bias | Grad Mean: 0.181745 | Grad Max: 0.982500
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001946 | Grad Max: 0.009336
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014138 | Grad Max: 0.014138
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001422 | Grad Max: 0.084964
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026804 | Grad Max: 0.487424
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.008445
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012240 | Grad Max: 0.045138
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000794
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002801 | Grad Max: 0.006673
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000352
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000978 | Grad Max: 0.002518
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002215 | Grad Max: 0.004526
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032520 | Grad Max: 0.032520
[GRADIENT NORM TOTAL] 3.6161

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.288
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50092363 0.49907637] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 57/1991 | B: 203/1845 | C: 249/1799
[LOSS Ex1] A: 0.67833 | B: 0.67860 | C: 0.67487
[LOGITS Ex2 A] Mean Abs: 1.206 | Max: 5.177
[LOSS Ex2] A: 0.30395 | B: 0.42996 | C: 0.40267
** [JOINT LOSS] ** : 1.056128
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005117 | Grad Max: 0.128874
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268606 | Grad Max: 1.429977
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.009081
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004166 | Grad Max: 0.004166
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.124443
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039494 | Grad Max: 0.712641
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000499 | Grad Max: 0.011612
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018357 | Grad Max: 0.063062
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000984
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004193 | Grad Max: 0.008973
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000518
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001478 | Grad Max: 0.003913
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003409 | Grad Max: 0.006406
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049874 | Grad Max: 0.049874
[GRADIENT NORM TOTAL] 5.3279

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.296
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5054748  0.49452516] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 54/1994 | B: 190/1666 | C: 247/1801
[LOSS Ex1] A: 0.67744 | B: 0.68071 | C: 0.67687
[LOGITS Ex2 A] Mean Abs: 1.213 | Max: 4.758
[LOSS Ex2] A: 0.30593 | B: 0.41218 | C: 0.38751
** [JOINT LOSS] ** : 1.046885
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001777 | Grad Max: 0.066097
  -> Layer: shared_layers.0.bias | Grad Mean: 0.136360 | Grad Max: 0.737066
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001938 | Grad Max: 0.009407
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007350 | Grad Max: 0.007350
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001024 | Grad Max: 0.069543
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019017 | Grad Max: 0.385829
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.007040
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008896 | Grad Max: 0.036149
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000577
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002008 | Grad Max: 0.005121
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000699 | Grad Max: 0.001985
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001560 | Grad Max: 0.003662
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023276 | Grad Max: 0.023276
[GRADIENT NORM TOTAL] 2.7866

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.255
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50798726 0.4920127 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 53/1995 | B: 194/1854 | C: 269/1779
[LOSS Ex1] A: 0.68006 | B: 0.68033 | C: 0.67453
[LOGITS Ex2 A] Mean Abs: 1.232 | Max: 4.973
[LOSS Ex2] A: 0.31151 | B: 0.44287 | C: 0.38155
** [JOINT LOSS] ** : 1.056948
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004511 | Grad Max: 0.104847
  -> Layer: shared_layers.0.bias | Grad Mean: 0.220027 | Grad Max: 1.135377
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001870 | Grad Max: 0.008255
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004976 | Grad Max: 0.004976
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001670 | Grad Max: 0.130665
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031685 | Grad Max: 0.724810
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000393 | Grad Max: 0.009896
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014472 | Grad Max: 0.051796
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000843
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003297 | Grad Max: 0.008078
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000419
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001153 | Grad Max: 0.003157
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002315 | Grad Max: 0.005059
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037152 | Grad Max: 0.037152
[GRADIENT NORM TOTAL] 4.3321

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5127431  0.48725685] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.515 | Std: 0.015
[MASKS] A(Pass/Fail): 44/2004 | B: 219/1829 | C: 248/1800
[LOSS Ex1] A: 0.67961 | B: 0.68070 | C: 0.67476
[LOGITS Ex2 A] Mean Abs: 1.222 | Max: 5.107
[LOSS Ex2] A: 0.30954 | B: 0.43821 | C: 0.36875
** [JOINT LOSS] ** : 1.050525
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006261 | Grad Max: 0.166374
  -> Layer: shared_layers.0.bias | Grad Mean: 0.303253 | Grad Max: 1.609333
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001881 | Grad Max: 0.008107
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000737 | Grad Max: 0.000737
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.173676
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043460 | Grad Max: 0.959827
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.011908
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019728 | Grad Max: 0.065602
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001067
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004518 | Grad Max: 0.010338
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000518
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001591 | Grad Max: 0.004181
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003422 | Grad Max: 0.007396
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051882 | Grad Max: 0.051882
[GRADIENT NORM TOTAL] 5.9436

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.273
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5498703  0.45012966] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 69/1979 | B: 205/1843 | C: 254/1794
[LOSS Ex1] A: 0.67845 | B: 0.67851 | C: 0.67611
[LOGITS Ex2 A] Mean Abs: 1.247 | Max: 5.032
[LOSS Ex2] A: 0.29802 | B: 0.41839 | C: 0.36690
** [JOINT LOSS] ** : 1.038789
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004246 | Grad Max: 0.108390
  -> Layer: shared_layers.0.bias | Grad Mean: 0.197219 | Grad Max: 1.045097
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.009911
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014772 | Grad Max: 0.014772
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001498 | Grad Max: 0.106560
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028120 | Grad Max: 0.586394
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000338 | Grad Max: 0.008426
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012389 | Grad Max: 0.044961
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000762
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002840 | Grad Max: 0.007175
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000351
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001009 | Grad Max: 0.002791
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002183 | Grad Max: 0.005408
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033742 | Grad Max: 0.033742
[GRADIENT NORM TOTAL] 3.8652

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.305
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50391847 0.4960815 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 68/1980 | B: 191/1665 | C: 165/1211
[LOSS Ex1] A: 0.68050 | B: 0.68062 | C: 0.67673
[LOGITS Ex2 A] Mean Abs: 1.219 | Max: 5.037
[LOSS Ex2] A: 0.29089 | B: 0.41239 | C: 0.35073
** [JOINT LOSS] ** : 1.030624
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004337 | Grad Max: 0.141622
  -> Layer: shared_layers.0.bias | Grad Mean: 0.135562 | Grad Max: 0.638980
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.008707
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012493 | Grad Max: 0.012493
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001135 | Grad Max: 0.073726
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020393 | Grad Max: 0.391761
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.005883
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008709 | Grad Max: 0.029622
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000659
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002037 | Grad Max: 0.005427
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000293
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000702 | Grad Max: 0.002070
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001676 | Grad Max: 0.004671
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022305 | Grad Max: 0.022305
[GRADIENT NORM TOTAL] 2.6829

[EPOCH SUMMARY] Train Loss: 1.0520

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0298 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 33/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.133
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54816884 0.45183116] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 80/1968 | B: 195/1853 | C: 246/1802
[LOSS Ex1] A: 0.67923 | B: 0.68024 | C: 0.67588
[LOGITS Ex2 A] Mean Abs: 1.202 | Max: 5.613
[LOSS Ex2] A: 0.32046 | B: 0.43774 | C: 0.39544
** [JOINT LOSS] ** : 1.062999
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005552 | Grad Max: 0.130305
  -> Layer: shared_layers.0.bias | Grad Mean: 0.244885 | Grad Max: 1.295592
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.008994
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009318 | Grad Max: 0.009318
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.128464
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036083 | Grad Max: 0.724726
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000441 | Grad Max: 0.009518
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016181 | Grad Max: 0.057397
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000920
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003717 | Grad Max: 0.008377
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000427
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001311 | Grad Max: 0.003322
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003126 | Grad Max: 0.006227
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044338 | Grad Max: 0.044338
[GRADIENT NORM TOTAL] 4.8468

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.180
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52836955 0.47163045] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 60/1556 | B: 220/1828 | C: 253/1795
[LOSS Ex1] A: 0.67839 | B: 0.68061 | C: 0.67621
[LOGITS Ex2 A] Mean Abs: 1.249 | Max: 6.098
[LOSS Ex2] A: 0.29669 | B: 0.42841 | C: 0.36481
** [JOINT LOSS] ** : 1.041704
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002745 | Grad Max: 0.071323
  -> Layer: shared_layers.0.bias | Grad Mean: 0.103243 | Grad Max: 0.523793
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.009401
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008860 | Grad Max: 0.008860
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000769 | Grad Max: 0.056745
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014154 | Grad Max: 0.316267
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000164 | Grad Max: 0.004441
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006008 | Grad Max: 0.022298
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000428
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001391 | Grad Max: 0.003584
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000207
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000488 | Grad Max: 0.001560
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001280 | Grad Max: 0.003554
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016847 | Grad Max: 0.016847
[GRADIENT NORM TOTAL] 1.9961

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.306
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50831884 0.4916812 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 72/1976 | B: 208/1840 | C: 272/1776
[LOSS Ex1] A: 0.67869 | B: 0.67841 | C: 0.67474
[LOGITS Ex2 A] Mean Abs: 1.271 | Max: 6.144
[LOSS Ex2] A: 0.30764 | B: 0.42526 | C: 0.37084
** [JOINT LOSS] ** : 1.045197
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003992 | Grad Max: 0.100650
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226475 | Grad Max: 1.243527
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002046 | Grad Max: 0.009223
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007982 | Grad Max: 0.007982
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001744 | Grad Max: 0.105278
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033224 | Grad Max: 0.605087
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.010091
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015463 | Grad Max: 0.056643
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000882
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003524 | Grad Max: 0.007848
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000429
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001242 | Grad Max: 0.003231
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002643 | Grad Max: 0.005376
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041191 | Grad Max: 0.041191
[GRADIENT NORM TOTAL] 4.5598

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50108284 0.4989171 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 76/1972 | B: 192/1664 | C: 255/1793
[LOSS Ex1] A: 0.67813 | B: 0.68053 | C: 0.67486
[LOGITS Ex2 A] Mean Abs: 1.286 | Max: 5.045
[LOSS Ex2] A: 0.32379 | B: 0.42210 | C: 0.38527
** [JOINT LOSS] ** : 1.054898
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006616 | Grad Max: 0.172676
  -> Layer: shared_layers.0.bias | Grad Mean: 0.314607 | Grad Max: 1.721980
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002035 | Grad Max: 0.009746
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011769 | Grad Max: 0.011769
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002433 | Grad Max: 0.154182
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046256 | Grad Max: 0.836916
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000582 | Grad Max: 0.015833
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021440 | Grad Max: 0.081567
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001210
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004879 | Grad Max: 0.011169
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000575
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001714 | Grad Max: 0.004432
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003889 | Grad Max: 0.007024
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.056978 | Grad Max: 0.056978
[GRADIENT NORM TOTAL] 6.2041

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.298
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50535256 0.49464747] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 81/1967 | B: 195/1853 | C: 222/1826
[LOSS Ex1] A: 0.67724 | B: 0.68015 | C: 0.67556
[LOGITS Ex2 A] Mean Abs: 1.261 | Max: 5.330
[LOSS Ex2] A: 0.31586 | B: 0.44300 | C: 0.37484
** [JOINT LOSS] ** : 1.055552
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004747 | Grad Max: 0.147704
  -> Layer: shared_layers.0.bias | Grad Mean: 0.148851 | Grad Max: 0.754299
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.009086
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003902 | Grad Max: 0.003902
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001262 | Grad Max: 0.069373
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022916 | Grad Max: 0.373530
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.006831
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009929 | Grad Max: 0.033714
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000673
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002298 | Grad Max: 0.005842
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000308
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000811 | Grad Max: 0.002270
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001833 | Grad Max: 0.004026
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027067 | Grad Max: 0.027067
[GRADIENT NORM TOTAL] 2.9925

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.256
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079285 0.4920715] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 70/1978 | B: 220/1828 | C: 237/1811
[LOSS Ex1] A: 0.67990 | B: 0.68052 | C: 0.67540
[LOGITS Ex2 A] Mean Abs: 1.196 | Max: 4.874
[LOSS Ex2] A: 0.30772 | B: 0.43049 | C: 0.37926
** [JOINT LOSS] ** : 1.051095
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004066 | Grad Max: 0.111483
  -> Layer: shared_layers.0.bias | Grad Mean: 0.235180 | Grad Max: 1.311640
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001812 | Grad Max: 0.007516
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001296 | Grad Max: 0.001296
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001827 | Grad Max: 0.109844
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034423 | Grad Max: 0.622982
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000430 | Grad Max: 0.011358
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015915 | Grad Max: 0.062132
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.001030
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003622 | Grad Max: 0.009061
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000406
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001262 | Grad Max: 0.003230
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002823 | Grad Max: 0.005943
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041818 | Grad Max: 0.041818
[GRADIENT NORM TOTAL] 4.7520

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5129882  0.48701182] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.015
[MASKS] A(Pass/Fail): 63/1985 | B: 211/1837 | C: 235/1813
[LOSS Ex1] A: 0.67945 | B: 0.67831 | C: 0.67781
[LOGITS Ex2 A] Mean Abs: 1.174 | Max: 5.402
[LOSS Ex2] A: 0.30319 | B: 0.42517 | C: 0.43304
** [JOINT LOSS] ** : 1.065657
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006013 | Grad Max: 0.141921
  -> Layer: shared_layers.0.bias | Grad Mean: 0.313207 | Grad Max: 1.685053
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.008414
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002721 | Grad Max: 0.002721
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.149744
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046211 | Grad Max: 0.854790
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.016693
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021360 | Grad Max: 0.083542
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000096 | Grad Max: 0.001172
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004883 | Grad Max: 0.010560
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000524
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001716 | Grad Max: 0.004145
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003893 | Grad Max: 0.007207
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057424 | Grad Max: 0.057424
[GRADIENT NORM TOTAL] 6.2778

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.275
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55076176 0.44923827] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 92/1956 | B: 193/1663 | C: 255/1793
[LOSS Ex1] A: 0.67826 | B: 0.68044 | C: 0.67458
[LOGITS Ex2 A] Mean Abs: 1.232 | Max: 5.110
[LOSS Ex2] A: 0.28997 | B: 0.41956 | C: 0.38410
** [JOINT LOSS] ** : 1.042304
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003229 | Grad Max: 0.087235
  -> Layer: shared_layers.0.bias | Grad Mean: 0.172311 | Grad Max: 0.910035
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.009594
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012879 | Grad Max: 0.012879
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.107805
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025463 | Grad Max: 0.601625
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000319 | Grad Max: 0.007576
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011800 | Grad Max: 0.043179
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000788
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002692 | Grad Max: 0.006289
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000326
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000933 | Grad Max: 0.002560
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002092 | Grad Max: 0.004458
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030373 | Grad Max: 0.030373
[GRADIENT NORM TOTAL] 3.4894

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.308
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50381637 0.49618366] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 78/1970 | B: 196/1852 | C: 239/1809
[LOSS Ex1] A: 0.68035 | B: 0.68006 | C: 0.67620
[LOGITS Ex2 A] Mean Abs: 1.262 | Max: 4.980
[LOSS Ex2] A: 0.29957 | B: 0.43606 | C: 0.39014
** [JOINT LOSS] ** : 1.054122
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003148 | Grad Max: 0.069340
  -> Layer: shared_layers.0.bias | Grad Mean: 0.177863 | Grad Max: 0.968507
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001846 | Grad Max: 0.008372
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009079 | Grad Max: 0.009079
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001315 | Grad Max: 0.085110
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025155 | Grad Max: 0.478028
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000309 | Grad Max: 0.007631
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011539 | Grad Max: 0.040648
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000694
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002596 | Grad Max: 0.006296
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000303
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000901 | Grad Max: 0.002408
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001819 | Grad Max: 0.003961
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028515 | Grad Max: 0.028515
[GRADIENT NORM TOTAL] 3.5395

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.135
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5488888  0.45111117] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 92/1956 | B: 220/1828 | C: 241/1807
[LOSS Ex1] A: 0.67906 | B: 0.68043 | C: 0.67802
[LOGITS Ex2 A] Mean Abs: 1.277 | Max: 4.860
[LOSS Ex2] A: 0.31183 | B: 0.44001 | C: 0.39232
** [JOINT LOSS] ** : 1.060558
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004460 | Grad Max: 0.122717
  -> Layer: shared_layers.0.bias | Grad Mean: 0.295780 | Grad Max: 1.647182
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001875 | Grad Max: 0.008864
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011857 | Grad Max: 0.011857
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.133095
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041453 | Grad Max: 0.754487
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000505 | Grad Max: 0.013667
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018886 | Grad Max: 0.074256
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001110
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004261 | Grad Max: 0.010375
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000472
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001483 | Grad Max: 0.003739
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003175 | Grad Max: 0.005972
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048011 | Grad Max: 0.048011
[GRADIENT NORM TOTAL] 5.8857

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.183
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5288698  0.47113016] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 74/1542 | B: 212/1836 | C: 253/1795
[LOSS Ex1] A: 0.67821 | B: 0.67821 | C: 0.67607
[LOGITS Ex2 A] Mean Abs: 1.296 | Max: 5.046
[LOSS Ex2] A: 0.28063 | B: 0.41805 | C: 0.37991
** [JOINT LOSS] ** : 1.037028
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.042310
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108343 | Grad Max: 0.595349
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.008992
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004178 | Grad Max: 0.004178
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000833 | Grad Max: 0.059042
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015913 | Grad Max: 0.332159
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.005024
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007246 | Grad Max: 0.026657
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000496
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001629 | Grad Max: 0.003978
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000209
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001601
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001090 | Grad Max: 0.003202
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018355 | Grad Max: 0.018355
[GRADIENT NORM TOTAL] 2.2056

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.309
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50829744 0.49170256] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 96/1952 | B: 193/1663 | C: 228/1820
[LOSS Ex1] A: 0.67851 | B: 0.68035 | C: 0.67774
[LOGITS Ex2 A] Mean Abs: 1.260 | Max: 4.767
[LOSS Ex2] A: 0.29878 | B: 0.41421 | C: 0.41359
** [JOINT LOSS] ** : 1.054395
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004270 | Grad Max: 0.113107
  -> Layer: shared_layers.0.bias | Grad Mean: 0.195671 | Grad Max: 1.004288
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.009674
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015278 | Grad Max: 0.015278
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.090775
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027186 | Grad Max: 0.483506
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000346 | Grad Max: 0.008042
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012803 | Grad Max: 0.047159
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000855
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002925 | Grad Max: 0.007709
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000377
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001018 | Grad Max: 0.002786
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002200 | Grad Max: 0.004331
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033106 | Grad Max: 0.033106
[GRADIENT NORM TOTAL] 3.7783

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.293
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50119853 0.49880144] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 91/1957 | B: 197/1851 | C: 242/1806
[LOSS Ex1] A: 0.67794 | B: 0.67996 | C: 0.67650
[LOGITS Ex2 A] Mean Abs: 1.224 | Max: 5.290
[LOSS Ex2] A: 0.30114 | B: 0.44408 | C: 0.40978
** [JOINT LOSS] ** : 1.063134
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004819 | Grad Max: 0.131888
  -> Layer: shared_layers.0.bias | Grad Mean: 0.305033 | Grad Max: 1.576498
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.009940
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015355 | Grad Max: 0.015355
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.158574
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043038 | Grad Max: 0.888311
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000539 | Grad Max: 0.011950
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020013 | Grad Max: 0.069996
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004550 | Grad Max: 0.011412
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000551
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001600 | Grad Max: 0.004128
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003643 | Grad Max: 0.006800
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053056 | Grad Max: 0.053056
[GRADIENT NORM TOTAL] 6.0458

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.301
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50525594 0.494744  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 103/1945 | B: 220/1828 | C: 195/1181
[LOSS Ex1] A: 0.67705 | B: 0.68034 | C: 0.67382
[LOGITS Ex2 A] Mean Abs: 1.231 | Max: 4.935
[LOSS Ex2] A: 0.30742 | B: 0.42694 | C: 0.38694
** [JOINT LOSS] ** : 1.050838
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002588 | Grad Max: 0.077523
  -> Layer: shared_layers.0.bias | Grad Mean: 0.163215 | Grad Max: 0.890655
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.009295
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005746 | Grad Max: 0.005746
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001179 | Grad Max: 0.083747
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022145 | Grad Max: 0.458802
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000286 | Grad Max: 0.008649
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010636 | Grad Max: 0.046678
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000613
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002387 | Grad Max: 0.006018
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000332
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000835 | Grad Max: 0.002456
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001941 | Grad Max: 0.004187
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028131 | Grad Max: 0.028131
[GRADIENT NORM TOTAL] 3.1367

[EPOCH SUMMARY] Train Loss: 1.0528

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0260 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0270 -> New: 1.0260)

############################## EPOCH 34/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.259
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078804  0.49211955] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 93/1955 | B: 214/1834 | C: 250/1798
[LOSS Ex1] A: 0.67975 | B: 0.67811 | C: 0.67517
[LOGITS Ex2 A] Mean Abs: 1.245 | Max: 4.950
[LOSS Ex2] A: 0.31647 | B: 0.42006 | C: 0.39188
** [JOINT LOSS] ** : 1.053814
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004379 | Grad Max: 0.107455
  -> Layer: shared_layers.0.bias | Grad Mean: 0.234408 | Grad Max: 1.260329
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001893 | Grad Max: 0.007699
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000112 | Grad Max: 0.000112
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001736 | Grad Max: 0.121082
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032792 | Grad Max: 0.691212
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000399 | Grad Max: 0.009938
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014708 | Grad Max: 0.055558
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000801
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003331 | Grad Max: 0.007518
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000376
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001152 | Grad Max: 0.002959
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002353 | Grad Max: 0.004653
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035971 | Grad Max: 0.035971
[GRADIENT NORM TOTAL] 4.5896

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.180
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5132211  0.48677894] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 81/1967 | B: 197/1659 | C: 238/1810
[LOSS Ex1] A: 0.67930 | B: 0.68026 | C: 0.67710
[LOGITS Ex2 A] Mean Abs: 1.247 | Max: 5.412
[LOSS Ex2] A: 0.31126 | B: 0.42162 | C: 0.36530
** [JOINT LOSS] ** : 1.044942
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006758 | Grad Max: 0.150611
  -> Layer: shared_layers.0.bias | Grad Mean: 0.318463 | Grad Max: 1.672163
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.008999
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008211 | Grad Max: 0.008211
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002457 | Grad Max: 0.173345
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046410 | Grad Max: 0.985597
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000572 | Grad Max: 0.012027
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021163 | Grad Max: 0.070962
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001267
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004822 | Grad Max: 0.011671
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000519
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001687 | Grad Max: 0.004112
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003717 | Grad Max: 0.007253
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.055991 | Grad Max: 0.055991
[GRADIENT NORM TOTAL] 6.2287

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.278
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55162746 0.44837254] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 108/1940 | B: 200/1848 | C: 238/1810
[LOSS Ex1] A: 0.67807 | B: 0.67987 | C: 0.67638
[LOGITS Ex2 A] Mean Abs: 1.267 | Max: 5.032
[LOSS Ex2] A: 0.29426 | B: 0.44134 | C: 0.39657
** [JOINT LOSS] ** : 1.055498
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004446 | Grad Max: 0.100987
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203608 | Grad Max: 1.066842
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.009863
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.019461 | Grad Max: 0.019461
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001554 | Grad Max: 0.106671
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029204 | Grad Max: 0.595119
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000362 | Grad Max: 0.009320
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013371 | Grad Max: 0.049809
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000835
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003036 | Grad Max: 0.007745
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000341
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001050 | Grad Max: 0.002750
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002146 | Grad Max: 0.004318
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033270 | Grad Max: 0.033270
[GRADIENT NORM TOTAL] 3.9989

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.311
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50373113 0.49626887] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 91/1957 | B: 222/1826 | C: 254/1794
[LOSS Ex1] A: 0.68019 | B: 0.68025 | C: 0.67549
[LOGITS Ex2 A] Mean Abs: 1.221 | Max: 5.472
[LOSS Ex2] A: 0.29455 | B: 0.42995 | C: 0.39920
** [JOINT LOSS] ** : 1.053205
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004035 | Grad Max: 0.107047
  -> Layer: shared_layers.0.bias | Grad Mean: 0.157306 | Grad Max: 0.876459
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001803 | Grad Max: 0.008117
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005238 | Grad Max: 0.005238
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001286 | Grad Max: 0.112681
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023839 | Grad Max: 0.612380
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000277 | Grad Max: 0.006064
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010242 | Grad Max: 0.032137
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000560
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002358 | Grad Max: 0.005639
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000818 | Grad Max: 0.002310
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001879 | Grad Max: 0.003717
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026869 | Grad Max: 0.026869
[GRADIENT NORM TOTAL] 3.2339

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.137
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5496462  0.45035383] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 100/1948 | B: 216/1832 | C: 258/1790
[LOSS Ex1] A: 0.67889 | B: 0.67802 | C: 0.67451
[LOGITS Ex2 A] Mean Abs: 1.228 | Max: 4.821
[LOSS Ex2] A: 0.30973 | B: 0.42393 | C: 0.38380
** [JOINT LOSS] ** : 1.049628
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005211 | Grad Max: 0.128521
  -> Layer: shared_layers.0.bias | Grad Mean: 0.224957 | Grad Max: 1.219501
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001992 | Grad Max: 0.008603
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006300 | Grad Max: 0.006300
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001809 | Grad Max: 0.125563
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034334 | Grad Max: 0.702602
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.010143
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015348 | Grad Max: 0.054735
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000947
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003490 | Grad Max: 0.008752
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000399
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001211 | Grad Max: 0.003059
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002809 | Grad Max: 0.005480
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040086 | Grad Max: 0.040086
[GRADIENT NORM TOTAL] 4.5360

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.185
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52938104 0.47061896] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 88/1528 | B: 198/1658 | C: 266/1782
[LOSS Ex1] A: 0.67804 | B: 0.68017 | C: 0.67467
[LOGITS Ex2 A] Mean Abs: 1.278 | Max: 4.783
[LOSS Ex2] A: 0.29570 | B: 0.40370 | C: 0.37821
** [JOINT LOSS] ** : 1.036828
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002966 | Grad Max: 0.068507
  -> Layer: shared_layers.0.bias | Grad Mean: 0.131578 | Grad Max: 0.702732
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001956 | Grad Max: 0.008981
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003753 | Grad Max: 0.003753
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001043 | Grad Max: 0.070976
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019105 | Grad Max: 0.395847
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000236 | Grad Max: 0.005800
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008676 | Grad Max: 0.030583
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000534
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001988 | Grad Max: 0.005115
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000686 | Grad Max: 0.001861
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001579 | Grad Max: 0.003527
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022705 | Grad Max: 0.022705
[GRADIENT NORM TOTAL] 2.6316

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.311
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50825953 0.4917405 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 116/1932 | B: 201/1847 | C: 254/1794
[LOSS Ex1] A: 0.67833 | B: 0.67979 | C: 0.67535
[LOGITS Ex2 A] Mean Abs: 1.292 | Max: 6.410
[LOSS Ex2] A: 0.30903 | B: 0.44808 | C: 0.39160
** [JOINT LOSS] ** : 1.060727
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005096 | Grad Max: 0.115786
  -> Layer: shared_layers.0.bias | Grad Mean: 0.253520 | Grad Max: 1.309974
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.009071
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010084 | Grad Max: 0.010084
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.123927
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036408 | Grad Max: 0.713114
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000456 | Grad Max: 0.010030
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016919 | Grad Max: 0.058159
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.001057
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003843 | Grad Max: 0.009263
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000420
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001327 | Grad Max: 0.003427
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002697 | Grad Max: 0.005082
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041661 | Grad Max: 0.041661
[GRADIENT NORM TOTAL] 4.9356

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.295
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50133944 0.4986606 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 105/1943 | B: 226/1822 | C: 242/1806
[LOSS Ex1] A: 0.67775 | B: 0.68017 | C: 0.67619
[LOGITS Ex2 A] Mean Abs: 1.282 | Max: 4.754
[LOSS Ex2] A: 0.31669 | B: 0.44737 | C: 0.38440
** [JOINT LOSS] ** : 1.060854
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005837 | Grad Max: 0.146125
  -> Layer: shared_layers.0.bias | Grad Mean: 0.318799 | Grad Max: 1.695800
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001949 | Grad Max: 0.009043
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008657 | Grad Max: 0.008657
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002414 | Grad Max: 0.163299
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046000 | Grad Max: 0.908077
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000567 | Grad Max: 0.013148
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021115 | Grad Max: 0.076091
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001214
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004787 | Grad Max: 0.010540
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000540
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001657 | Grad Max: 0.004150
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003548 | Grad Max: 0.006647
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053134 | Grad Max: 0.053134
[GRADIENT NORM TOTAL] 6.3412

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.303
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.505163 0.494837] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 117/1931 | B: 219/1829 | C: 246/1802
[LOSS Ex1] A: 0.67685 | B: 0.67793 | C: 0.67626
[LOGITS Ex2 A] Mean Abs: 1.278 | Max: 4.819
[LOSS Ex2] A: 0.31787 | B: 0.41816 | C: 0.38984
** [JOINT LOSS] ** : 1.052305
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005502 | Grad Max: 0.168970
  -> Layer: shared_layers.0.bias | Grad Mean: 0.209254 | Grad Max: 1.060540
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002032 | Grad Max: 0.009225
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005864 | Grad Max: 0.005864
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001730 | Grad Max: 0.109938
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031827 | Grad Max: 0.605209
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.009808
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014230 | Grad Max: 0.052972
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000751
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003264 | Grad Max: 0.007411
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000374
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001136 | Grad Max: 0.002903
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002436 | Grad Max: 0.004883
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036425 | Grad Max: 0.036425
[GRADIENT NORM TOTAL] 4.1644

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.260
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078257  0.49217436] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 103/1945 | B: 200/1656 | C: 236/1812
[LOSS Ex1] A: 0.67959 | B: 0.68009 | C: 0.67711
[LOGITS Ex2 A] Mean Abs: 1.216 | Max: 4.896
[LOSS Ex2] A: 0.29993 | B: 0.40986 | C: 0.36804
** [JOINT LOSS] ** : 1.038203
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003742 | Grad Max: 0.081576
  -> Layer: shared_layers.0.bias | Grad Mean: 0.170897 | Grad Max: 0.868119
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001785 | Grad Max: 0.008026
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007177 | Grad Max: 0.007177
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001318 | Grad Max: 0.151885
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024702 | Grad Max: 0.851390
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000300 | Grad Max: 0.008340
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011092 | Grad Max: 0.043843
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000744
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002526 | Grad Max: 0.006176
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000304
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000860 | Grad Max: 0.002336
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001890 | Grad Max: 0.004368
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027059 | Grad Max: 0.027059
[GRADIENT NORM TOTAL] 3.4759

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.181
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51343113 0.4865689 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 89/1959 | B: 202/1846 | C: 247/1801
[LOSS Ex1] A: 0.67914 | B: 0.67970 | C: 0.67476
[LOGITS Ex2 A] Mean Abs: 1.170 | Max: 5.256
[LOSS Ex2] A: 0.30040 | B: 0.45027 | C: 0.37551
** [JOINT LOSS] ** : 1.053261
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005384 | Grad Max: 0.119924
  -> Layer: shared_layers.0.bias | Grad Mean: 0.298249 | Grad Max: 1.579906
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001894 | Grad Max: 0.008577
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003173 | Grad Max: 0.003173
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.141180
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042991 | Grad Max: 0.812237
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000533 | Grad Max: 0.012499
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019751 | Grad Max: 0.068227
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.001123
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004478 | Grad Max: 0.010352
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000483
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001558 | Grad Max: 0.003785
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003531 | Grad Max: 0.006755
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051351 | Grad Max: 0.051351
[GRADIENT NORM TOTAL] 5.9367

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.280
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5524715  0.44752848] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 119/1929 | B: 227/1821 | C: 234/1814
[LOSS Ex1] A: 0.67788 | B: 0.68009 | C: 0.67716
[LOGITS Ex2 A] Mean Abs: 1.235 | Max: 5.066
[LOSS Ex2] A: 0.28100 | B: 0.42811 | C: 0.39771
** [JOINT LOSS] ** : 1.047317
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002696 | Grad Max: 0.071633
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153977 | Grad Max: 0.766755
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.010007
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016817 | Grad Max: 0.016817
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001115 | Grad Max: 0.080677
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020492 | Grad Max: 0.462606
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.006561
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009334 | Grad Max: 0.035382
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000638
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002127 | Grad Max: 0.005121
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000737 | Grad Max: 0.002041
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001720 | Grad Max: 0.003892
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024444 | Grad Max: 0.024444
[GRADIENT NORM TOTAL] 2.9395

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.313
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50365096 0.49634904] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 104/1944 | B: 220/1828 | C: 246/1802
[LOSS Ex1] A: 0.68003 | B: 0.67784 | C: 0.67483
[LOGITS Ex2 A] Mean Abs: 1.266 | Max: 4.884
[LOSS Ex2] A: 0.29014 | B: 0.42044 | C: 0.36364
** [JOINT LOSS] ** : 1.035643
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003090 | Grad Max: 0.088830
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191854 | Grad Max: 1.089912
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001967 | Grad Max: 0.008929
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009268 | Grad Max: 0.009268
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001449 | Grad Max: 0.103132
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027769 | Grad Max: 0.585972
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.007929
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012789 | Grad Max: 0.043262
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000746
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002878 | Grad Max: 0.006679
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000360
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001009 | Grad Max: 0.002692
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002094 | Grad Max: 0.005224
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033232 | Grad Max: 0.033232
[GRADIENT NORM TOTAL] 3.8928

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.139
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55033994 0.44966003] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.016
[MASKS] A(Pass/Fail): 112/1936 | B: 201/1655 | C: 181/1195
[LOSS Ex1] A: 0.67872 | B: 0.68000 | C: 0.67394
[LOGITS Ex2 A] Mean Abs: 1.277 | Max: 5.023
[LOSS Ex2] A: 0.31968 | B: 0.41642 | C: 0.37609
** [JOINT LOSS] ** : 1.048287
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004971 | Grad Max: 0.118842
  -> Layer: shared_layers.0.bias | Grad Mean: 0.234652 | Grad Max: 1.271636
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001901 | Grad Max: 0.008545
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004607 | Grad Max: 0.004607
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001820 | Grad Max: 0.112619
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034201 | Grad Max: 0.632753
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000417 | Grad Max: 0.009756
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015478 | Grad Max: 0.054966
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000888
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003514 | Grad Max: 0.008239
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000400
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001223 | Grad Max: 0.003033
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002659 | Grad Max: 0.005231
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039831 | Grad Max: 0.039831
[GRADIENT NORM TOTAL] 4.6561

[EPOCH SUMMARY] Train Loss: 1.0493

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0183 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0260 -> New: 1.0183)

############################## EPOCH 35/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.187
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52984667 0.47015333] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 94/1522 | B: 203/1845 | C: 248/1800
[LOSS Ex1] A: 0.67786 | B: 0.67962 | C: 0.67646
[LOGITS Ex2 A] Mean Abs: 1.279 | Max: 4.915
[LOSS Ex2] A: 0.29040 | B: 0.43804 | C: 0.37076
** [JOINT LOSS] ** : 1.044381
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001563 | Grad Max: 0.030085
  -> Layer: shared_layers.0.bias | Grad Mean: 0.075450 | Grad Max: 0.418441
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001922 | Grad Max: 0.009033
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006612 | Grad Max: 0.006612
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000587 | Grad Max: 0.035372
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010857 | Grad Max: 0.194953
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.005562
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005000 | Grad Max: 0.022008
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000401
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001087 | Grad Max: 0.003004
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000172
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001103
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000619 | Grad Max: 0.002163
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010686 | Grad Max: 0.010686
[GRADIENT NORM TOTAL] 1.4975

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.314
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50823516 0.49176484] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 130/1918 | B: 228/1820 | C: 277/1771
[LOSS Ex1] A: 0.67815 | B: 0.68000 | C: 0.67330
[LOGITS Ex2 A] Mean Abs: 1.249 | Max: 5.532
[LOSS Ex2] A: 0.29117 | B: 0.43544 | C: 0.36898
** [JOINT LOSS] ** : 1.042346
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006095 | Grad Max: 0.139372
  -> Layer: shared_layers.0.bias | Grad Mean: 0.273130 | Grad Max: 1.433512
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.009033
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008566 | Grad Max: 0.008566
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.133811
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039543 | Grad Max: 0.752272
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000494 | Grad Max: 0.012022
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018337 | Grad Max: 0.068157
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.001042
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004176 | Grad Max: 0.009696
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000477
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001442 | Grad Max: 0.003656
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003219 | Grad Max: 0.005833
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046970 | Grad Max: 0.046970
[GRADIENT NORM TOTAL] 5.2965

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.298
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014685 0.4985315] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 116/1932 | B: 223/1825 | C: 236/1812
[LOSS Ex1] A: 0.67755 | B: 0.67774 | C: 0.67725
[LOGITS Ex2 A] Mean Abs: 1.231 | Max: 5.023
[LOSS Ex2] A: 0.29323 | B: 0.43406 | C: 0.42610
** [JOINT LOSS] ** : 1.061977
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006446 | Grad Max: 0.150730
  -> Layer: shared_layers.0.bias | Grad Mean: 0.353738 | Grad Max: 1.852015
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.009474
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011181 | Grad Max: 0.011181
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002637 | Grad Max: 0.173118
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050257 | Grad Max: 0.973984
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000620 | Grad Max: 0.015506
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023156 | Grad Max: 0.088698
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001204
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005246 | Grad Max: 0.011898
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000648
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001827 | Grad Max: 0.004659
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003988 | Grad Max: 0.007770
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060012 | Grad Max: 0.060012
[GRADIENT NORM TOTAL] 6.8889

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.305
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050696 0.4949304] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 129/1919 | B: 205/1651 | C: 256/1792
[LOSS Ex1] A: 0.67665 | B: 0.67991 | C: 0.67440
[LOGITS Ex2 A] Mean Abs: 1.232 | Max: 5.602
[LOSS Ex2] A: 0.30541 | B: 0.41673 | C: 0.39836
** [JOINT LOSS] ** : 1.050486
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003481 | Grad Max: 0.098876
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226729 | Grad Max: 1.224972
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.010110
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011253 | Grad Max: 0.011253
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001644 | Grad Max: 0.113753
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030588 | Grad Max: 0.635706
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000380 | Grad Max: 0.009767
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014227 | Grad Max: 0.054944
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000882
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003203 | Grad Max: 0.008133
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000354
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001105 | Grad Max: 0.002839
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002338 | Grad Max: 0.004439
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036090 | Grad Max: 0.036090
[GRADIENT NORM TOTAL] 4.4433

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.262
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077407 0.4922593] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 112/1936 | B: 206/1842 | C: 229/1819
[LOSS Ex1] A: 0.67942 | B: 0.67952 | C: 0.67721
[LOGITS Ex2 A] Mean Abs: 1.249 | Max: 4.937
[LOSS Ex2] A: 0.29558 | B: 0.44443 | C: 0.40375
** [JOINT LOSS] ** : 1.059971
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.061418
  -> Layer: shared_layers.0.bias | Grad Mean: 0.106882 | Grad Max: 0.559651
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001778 | Grad Max: 0.007779
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004112 | Grad Max: 0.004112
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000765 | Grad Max: 0.064443
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014378 | Grad Max: 0.360947
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.005788
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006547 | Grad Max: 0.029952
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000429
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001469 | Grad Max: 0.003620
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000202
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000493 | Grad Max: 0.001494
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000785 | Grad Max: 0.002139
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014007 | Grad Max: 0.014007
[GRADIENT NORM TOTAL] 2.0615

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.183
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5136884  0.48631167] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.016
[MASKS] A(Pass/Fail): 95/1953 | B: 230/1818 | C: 233/1815
[LOSS Ex1] A: 0.67897 | B: 0.67990 | C: 0.67725
[LOGITS Ex2 A] Mean Abs: 1.233 | Max: 5.125
[LOSS Ex2] A: 0.30559 | B: 0.43234 | C: 0.37308
** [JOINT LOSS] ** : 1.049042
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004731 | Grad Max: 0.123474
  -> Layer: shared_layers.0.bias | Grad Mean: 0.229077 | Grad Max: 1.168384
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001778 | Grad Max: 0.008168
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005577 | Grad Max: 0.005577
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001705 | Grad Max: 0.122931
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032045 | Grad Max: 0.684206
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000384 | Grad Max: 0.010760
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014311 | Grad Max: 0.052035
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.001022
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003263 | Grad Max: 0.008421
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000375
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001138 | Grad Max: 0.002881
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002411 | Grad Max: 0.005314
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036862 | Grad Max: 0.036862
[GRADIENT NORM TOTAL] 4.4591

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.283
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.553426   0.44657397] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 130/1918 | B: 224/1824 | C: 231/1817
[LOSS Ex1] A: 0.67768 | B: 0.67764 | C: 0.67764
[LOGITS Ex2 A] Mean Abs: 1.256 | Max: 5.045
[LOSS Ex2] A: 0.29977 | B: 0.40909 | C: 0.37523
** [JOINT LOSS] ** : 1.039014
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003689 | Grad Max: 0.102090
  -> Layer: shared_layers.0.bias | Grad Mean: 0.133377 | Grad Max: 0.648451
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.010188
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.022089 | Grad Max: 0.022089
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001013 | Grad Max: 0.103811
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018843 | Grad Max: 0.558659
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.005189
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008319 | Grad Max: 0.027665
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000508
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001912 | Grad Max: 0.004426
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000223
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000664 | Grad Max: 0.001657
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001272 | Grad Max: 0.003792
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020873 | Grad Max: 0.020873
[GRADIENT NORM TOTAL] 2.6221

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.316
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5035428  0.49645725] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 123/1925 | B: 208/1648 | C: 241/1807
[LOSS Ex1] A: 0.67985 | B: 0.67981 | C: 0.67531
[LOGITS Ex2 A] Mean Abs: 1.221 | Max: 5.030
[LOSS Ex2] A: 0.28479 | B: 0.41326 | C: 0.38621
** [JOINT LOSS] ** : 1.039744
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005673 | Grad Max: 0.149941
  -> Layer: shared_layers.0.bias | Grad Mean: 0.278923 | Grad Max: 1.426606
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001796 | Grad Max: 0.008077
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004880 | Grad Max: 0.004880
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.142388
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039315 | Grad Max: 0.777520
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000480 | Grad Max: 0.011507
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017874 | Grad Max: 0.063502
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.001050
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004089 | Grad Max: 0.010115
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000445
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001407 | Grad Max: 0.003543
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003036 | Grad Max: 0.006414
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045080 | Grad Max: 0.045080
[GRADIENT NORM TOTAL] 5.4257

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.142
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5512097  0.44879034] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 128/1920 | B: 208/1840 | C: 239/1809
[LOSS Ex1] A: 0.67852 | B: 0.67942 | C: 0.67608
[LOGITS Ex2 A] Mean Abs: 1.212 | Max: 5.201
[LOSS Ex2] A: 0.30485 | B: 0.44673 | C: 0.39836
** [JOINT LOSS] ** : 1.061323
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006416 | Grad Max: 0.143084
  -> Layer: shared_layers.0.bias | Grad Mean: 0.339486 | Grad Max: 1.778114
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001939 | Grad Max: 0.009216
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009543 | Grad Max: 0.009543
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002573 | Grad Max: 0.169450
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048560 | Grad Max: 0.940086
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000588 | Grad Max: 0.013782
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021975 | Grad Max: 0.078503
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001185
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005002 | Grad Max: 0.011419
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000546
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001739 | Grad Max: 0.004428
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003868 | Grad Max: 0.007210
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057117 | Grad Max: 0.057117
[GRADIENT NORM TOTAL] 6.6589

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.190
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53040534 0.46959463] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 97/1519 | B: 231/1817 | C: 248/1800
[LOSS Ex1] A: 0.67766 | B: 0.67980 | C: 0.67542
[LOGITS Ex2 A] Mean Abs: 1.272 | Max: 5.591
[LOSS Ex2] A: 0.27735 | B: 0.43447 | C: 0.36613
** [JOINT LOSS] ** : 1.036943
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004046 | Grad Max: 0.108586
  -> Layer: shared_layers.0.bias | Grad Mean: 0.189315 | Grad Max: 0.965502
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001928 | Grad Max: 0.008774
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004004 | Grad Max: 0.004004
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001443 | Grad Max: 0.100706
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027008 | Grad Max: 0.576818
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000339 | Grad Max: 0.008150
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012626 | Grad Max: 0.046838
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000745
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002887 | Grad Max: 0.006590
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000350
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000998 | Grad Max: 0.002639
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002312 | Grad Max: 0.005010
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033397 | Grad Max: 0.033397
[GRADIENT NORM TOTAL] 3.7242

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.317
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081723  0.49182773] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 139/1909 | B: 226/1822 | C: 272/1776
[LOSS Ex1] A: 0.67795 | B: 0.67753 | C: 0.67301
[LOGITS Ex2 A] Mean Abs: 1.286 | Max: 6.361
[LOSS Ex2] A: 0.30353 | B: 0.42520 | C: 0.37760
** [JOINT LOSS] ** : 1.044940
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003178 | Grad Max: 0.081341
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201103 | Grad Max: 1.057626
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.009327
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008535 | Grad Max: 0.008535
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001542 | Grad Max: 0.110072
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028976 | Grad Max: 0.615784
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.009273
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013601 | Grad Max: 0.050226
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000797
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003084 | Grad Max: 0.007945
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000363
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001073 | Grad Max: 0.002717
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002243 | Grad Max: 0.004718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035319 | Grad Max: 0.035319
[GRADIENT NORM TOTAL] 4.0780

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.300
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50163746 0.49836257] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 125/1923 | B: 210/1646 | C: 266/1782
[LOSS Ex1] A: 0.67733 | B: 0.67971 | C: 0.67364
[LOGITS Ex2 A] Mean Abs: 1.292 | Max: 4.953
[LOSS Ex2] A: 0.30004 | B: 0.41187 | C: 0.35928
** [JOINT LOSS] ** : 1.033964
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005810 | Grad Max: 0.144451
  -> Layer: shared_layers.0.bias | Grad Mean: 0.276216 | Grad Max: 1.505029
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.009346
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009014 | Grad Max: 0.009014
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.158805
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040373 | Grad Max: 0.888416
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000498 | Grad Max: 0.012024
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018577 | Grad Max: 0.068374
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.001035
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004217 | Grad Max: 0.009759
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000460
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001462 | Grad Max: 0.003576
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003030 | Grad Max: 0.006027
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047236 | Grad Max: 0.047236
[GRADIENT NORM TOTAL] 5.4737

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.308
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50492465 0.49507535] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 135/1913 | B: 209/1839 | C: 266/1782
[LOSS Ex1] A: 0.67644 | B: 0.67933 | C: 0.67224
[LOGITS Ex2 A] Mean Abs: 1.281 | Max: 5.661
[LOSS Ex2] A: 0.31135 | B: 0.44342 | C: 0.37520
** [JOINT LOSS] ** : 1.052659
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003027 | Grad Max: 0.099241
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134324 | Grad Max: 0.673751
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.008819
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000305 | Grad Max: 0.000305
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001083 | Grad Max: 0.070415
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019928 | Grad Max: 0.403514
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000239 | Grad Max: 0.007091
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008853 | Grad Max: 0.035058
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000552
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002011 | Grad Max: 0.004873
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000240
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000692 | Grad Max: 0.001909
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001383 | Grad Max: 0.003385
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021986 | Grad Max: 0.021986
[GRADIENT NORM TOTAL] 2.6691

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.264
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50762796 0.49237207] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.516 | Std: 0.017
[MASKS] A(Pass/Fail): 117/1931 | B: 234/1814 | C: 161/1215
[LOSS Ex1] A: 0.67926 | B: 0.67971 | C: 0.67595
[LOGITS Ex2 A] Mean Abs: 1.225 | Max: 5.923
[LOSS Ex2] A: 0.29873 | B: 0.42515 | C: 0.39047
** [JOINT LOSS] ** : 1.049757
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004036 | Grad Max: 0.107878
  -> Layer: shared_layers.0.bias | Grad Mean: 0.158279 | Grad Max: 0.839885
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.007849
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007285 | Grad Max: 0.007285
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001218 | Grad Max: 0.090999
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022895 | Grad Max: 0.506899
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.006909
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010282 | Grad Max: 0.035414
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000653
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002339 | Grad Max: 0.005785
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000292
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000814 | Grad Max: 0.002188
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001942 | Grad Max: 0.004225
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027304 | Grad Max: 0.027304
[GRADIENT NORM TOTAL] 3.0835

[EPOCH SUMMARY] Train Loss: 1.0476

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0234 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 36/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.184
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5139389  0.48606113] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.516 | Std: 0.017
[MASKS] A(Pass/Fail): 108/1940 | B: 227/1821 | C: 232/1816
[LOSS Ex1] A: 0.67880 | B: 0.67743 | C: 0.67608
[LOGITS Ex2 A] Mean Abs: 1.185 | Max: 5.532
[LOSS Ex2] A: 0.29922 | B: 0.41500 | C: 0.41483
** [JOINT LOSS] ** : 1.053787
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004926 | Grad Max: 0.121432
  -> Layer: shared_layers.0.bias | Grad Mean: 0.275188 | Grad Max: 1.496322
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.007957
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001144 | Grad Max: 0.001144
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.141121
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039263 | Grad Max: 0.776415
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.012161
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018403 | Grad Max: 0.066386
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000988
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004153 | Grad Max: 0.009627
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000429
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001445 | Grad Max: 0.003614
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003266 | Grad Max: 0.006287
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048589 | Grad Max: 0.048589
[GRADIENT NORM TOTAL] 5.4781

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.285
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55438626 0.44561374] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.017
[MASKS] A(Pass/Fail): 139/1909 | B: 212/1644 | C: 232/1816
[LOSS Ex1] A: 0.67748 | B: 0.67962 | C: 0.67607
[LOGITS Ex2 A] Mean Abs: 1.245 | Max: 5.204
[LOSS Ex2] A: 0.27542 | B: 0.41253 | C: 0.37893
** [JOINT LOSS] ** : 1.033349
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002707 | Grad Max: 0.073690
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108300 | Grad Max: 0.551556
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.009413
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015293 | Grad Max: 0.015293
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000871 | Grad Max: 0.064781
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016040 | Grad Max: 0.360474
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000197 | Grad Max: 0.004897
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007353 | Grad Max: 0.027554
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000486
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001676 | Grad Max: 0.004207
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000224
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000562 | Grad Max: 0.001805
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001149 | Grad Max: 0.003398
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016505 | Grad Max: 0.016505
[GRADIENT NORM TOTAL] 2.1905

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.318
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503435   0.49656495] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 133/1915 | B: 210/1838 | C: 235/1813
[LOSS Ex1] A: 0.67969 | B: 0.67923 | C: 0.67557
[LOGITS Ex2 A] Mean Abs: 1.286 | Max: 4.677
[LOSS Ex2] A: 0.30635 | B: 0.44438 | C: 0.37888
** [JOINT LOSS] ** : 1.054698
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004389 | Grad Max: 0.096634
  -> Layer: shared_layers.0.bias | Grad Mean: 0.245016 | Grad Max: 1.294692
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.008176
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005974 | Grad Max: 0.005974
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001790 | Grad Max: 0.119895
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034504 | Grad Max: 0.675913
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.009924
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016000 | Grad Max: 0.054855
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000892
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003573 | Grad Max: 0.008255
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000387
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001228 | Grad Max: 0.003182
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002458 | Grad Max: 0.004914
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039258 | Grad Max: 0.039258
[GRADIENT NORM TOTAL] 4.7717

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.144
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55197996 0.44802004] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 139/1909 | B: 235/1813 | C: 279/1769
[LOSS Ex1] A: 0.67835 | B: 0.67962 | C: 0.67351
[LOGITS Ex2 A] Mean Abs: 1.312 | Max: 4.963
[LOSS Ex2] A: 0.31751 | B: 0.44967 | C: 0.37083
** [JOINT LOSS] ** : 1.056496
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007120 | Grad Max: 0.161422
  -> Layer: shared_layers.0.bias | Grad Mean: 0.380309 | Grad Max: 2.020315
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.008580
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005977 | Grad Max: 0.005977
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002886 | Grad Max: 0.211913
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054354 | Grad Max: 1.154563
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000660 | Grad Max: 0.016506
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024773 | Grad Max: 0.090863
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000107 | Grad Max: 0.001214
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005613 | Grad Max: 0.012186
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000585
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001941 | Grad Max: 0.004813
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004138 | Grad Max: 0.007507
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062837 | Grad Max: 0.062837
[GRADIENT NORM TOTAL] 7.5744

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.193
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53087866 0.46912134] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 104/1512 | B: 228/1820 | C: 252/1796
[LOSS Ex1] A: 0.67748 | B: 0.67733 | C: 0.67539
[LOGITS Ex2 A] Mean Abs: 1.320 | Max: 4.946
[LOSS Ex2] A: 0.27882 | B: 0.41897 | C: 0.37958
** [JOINT LOSS] ** : 1.035863
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004251 | Grad Max: 0.094889
  -> Layer: shared_layers.0.bias | Grad Mean: 0.213091 | Grad Max: 1.146776
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.008965
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003737 | Grad Max: 0.003737
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001632 | Grad Max: 0.117058
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030936 | Grad Max: 0.647550
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.009338
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014190 | Grad Max: 0.053809
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000798
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003216 | Grad Max: 0.007907
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000363
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001108 | Grad Max: 0.002863
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002198 | Grad Max: 0.004890
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035287 | Grad Max: 0.035287
[GRADIENT NORM TOTAL] 4.2376

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.319
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50814706 0.49185294] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 158/1890 | B: 213/1643 | C: 222/1826
[LOSS Ex1] A: 0.67776 | B: 0.67953 | C: 0.67832
[LOGITS Ex2 A] Mean Abs: 1.276 | Max: 6.043
[LOSS Ex2] A: 0.29040 | B: 0.40020 | C: 0.37781
** [JOINT LOSS] ** : 1.034677
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.053544
  -> Layer: shared_layers.0.bias | Grad Mean: 0.124016 | Grad Max: 0.657545
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.009240
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016076 | Grad Max: 0.016076
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000921 | Grad Max: 0.088004
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017086 | Grad Max: 0.502687
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.005921
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008020 | Grad Max: 0.031724
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000589
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001776 | Grad Max: 0.005229
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000226
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000588 | Grad Max: 0.001767
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001164 | Grad Max: 0.003162
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017738 | Grad Max: 0.017738
[GRADIENT NORM TOTAL] 2.5179

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.302
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017598  0.49824017] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 143/1905 | B: 210/1838 | C: 255/1793
[LOSS Ex1] A: 0.67713 | B: 0.67914 | C: 0.67378
[LOGITS Ex2 A] Mean Abs: 1.243 | Max: 5.288
[LOSS Ex2] A: 0.28573 | B: 0.44073 | C: 0.37953
** [JOINT LOSS] ** : 1.045348
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003359 | Grad Max: 0.088366
  -> Layer: shared_layers.0.bias | Grad Mean: 0.206118 | Grad Max: 1.126014
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.009603
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012611 | Grad Max: 0.012611
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001556 | Grad Max: 0.125906
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029398 | Grad Max: 0.716743
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.008947
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013590 | Grad Max: 0.045761
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000767
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003076 | Grad Max: 0.007310
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000375
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001058 | Grad Max: 0.002841
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002287 | Grad Max: 0.004686
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033513 | Grad Max: 0.033513
[GRADIENT NORM TOTAL] 4.1738

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.310
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50483876 0.49516127] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 148/1900 | B: 236/1812 | C: 253/1795
[LOSS Ex1] A: 0.67624 | B: 0.67953 | C: 0.67532
[LOGITS Ex2 A] Mean Abs: 1.261 | Max: 5.581
[LOSS Ex2] A: 0.30440 | B: 0.43428 | C: 0.38089
** [JOINT LOSS] ** : 1.050221
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.057725
  -> Layer: shared_layers.0.bias | Grad Mean: 0.109517 | Grad Max: 0.573164
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.009648
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009611 | Grad Max: 0.009611
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000815 | Grad Max: 0.050640
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014476 | Grad Max: 0.278997
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.005770
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006298 | Grad Max: 0.029548
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000374
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001367 | Grad Max: 0.003672
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000158
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000471 | Grad Max: 0.001298
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001095 | Grad Max: 0.002546
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016475 | Grad Max: 0.016475
[GRADIENT NORM TOTAL] 2.1148

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.266
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075457  0.49245432] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 132/1916 | B: 228/1820 | C: 223/1825
[LOSS Ex1] A: 0.67910 | B: 0.67723 | C: 0.67638
[LOGITS Ex2 A] Mean Abs: 1.283 | Max: 4.873
[LOSS Ex2] A: 0.30838 | B: 0.41267 | C: 0.36695
** [JOINT LOSS] ** : 1.040237
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003474 | Grad Max: 0.090508
  -> Layer: shared_layers.0.bias | Grad Mean: 0.241078 | Grad Max: 1.272005
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.007980
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003534 | Grad Max: 0.003534
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001712 | Grad Max: 0.147137
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032492 | Grad Max: 0.800623
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000402 | Grad Max: 0.010842
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015118 | Grad Max: 0.060125
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000841
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003399 | Grad Max: 0.007823
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000350
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001176 | Grad Max: 0.002915
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002413 | Grad Max: 0.005470
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038288 | Grad Max: 0.038288
[GRADIENT NORM TOTAL] 4.7784

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.185
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51418275 0.48581725] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 117/1931 | B: 213/1643 | C: 259/1789
[LOSS Ex1] A: 0.67864 | B: 0.67943 | C: 0.67458
[LOGITS Ex2 A] Mean Abs: 1.269 | Max: 5.427
[LOSS Ex2] A: 0.30513 | B: 0.41858 | C: 0.37531
** [JOINT LOSS] ** : 1.043895
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005423 | Grad Max: 0.165969
  -> Layer: shared_layers.0.bias | Grad Mean: 0.315766 | Grad Max: 1.719379
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.008570
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006540 | Grad Max: 0.006540
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002260 | Grad Max: 0.155204
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043127 | Grad Max: 0.875954
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.013126
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020235 | Grad Max: 0.074434
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001121
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004548 | Grad Max: 0.010600
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000528
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001566 | Grad Max: 0.003980
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003207 | Grad Max: 0.006345
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050092 | Grad Max: 0.050092
[GRADIENT NORM TOTAL] 6.1548

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.288
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55529684 0.44470316] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 148/1900 | B: 212/1836 | C: 271/1777
[LOSS Ex1] A: 0.67728 | B: 0.67904 | C: 0.67252
[LOGITS Ex2 A] Mean Abs: 1.281 | Max: 5.149
[LOSS Ex2] A: 0.28695 | B: 0.43670 | C: 0.37515
** [JOINT LOSS] ** : 1.042546
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003259 | Grad Max: 0.076845
  -> Layer: shared_layers.0.bias | Grad Mean: 0.162561 | Grad Max: 0.871794
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.009947
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012810 | Grad Max: 0.012810
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001193 | Grad Max: 0.101807
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022475 | Grad Max: 0.566475
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.008079
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010701 | Grad Max: 0.047683
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000602
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002409 | Grad Max: 0.005780
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000278
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000828 | Grad Max: 0.002333
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001594 | Grad Max: 0.003697
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025946 | Grad Max: 0.025946
[GRADIENT NORM TOTAL] 3.2152

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.321
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5033245 0.4966755] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 145/1903 | B: 237/1811 | C: 248/1800
[LOSS Ex1] A: 0.67952 | B: 0.67944 | C: 0.67403
[LOGITS Ex2 A] Mean Abs: 1.266 | Max: 4.958
[LOSS Ex2] A: 0.27446 | B: 0.42749 | C: 0.38047
** [JOINT LOSS] ** : 1.038471
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006177 | Grad Max: 0.174581
  -> Layer: shared_layers.0.bias | Grad Mean: 0.240628 | Grad Max: 1.227337
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001831 | Grad Max: 0.007859
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002768 | Grad Max: 0.002768
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001970 | Grad Max: 0.117773
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036133 | Grad Max: 0.685182
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.011862
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015995 | Grad Max: 0.059784
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000851
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003675 | Grad Max: 0.008413
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000397
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.003112
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002818 | Grad Max: 0.005621
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041233 | Grad Max: 0.041233
[GRADIENT NORM TOTAL] 4.7519

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.146
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55277413 0.44722587] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 147/1901 | B: 230/1818 | C: 270/1778
[LOSS Ex1] A: 0.67817 | B: 0.67714 | C: 0.67527
[LOGITS Ex2 A] Mean Abs: 1.260 | Max: 5.148
[LOSS Ex2] A: 0.30453 | B: 0.41302 | C: 0.36701
** [JOINT LOSS] ** : 1.038380
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007194 | Grad Max: 0.159868
  -> Layer: shared_layers.0.bias | Grad Mean: 0.294359 | Grad Max: 1.556589
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.009250
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012213 | Grad Max: 0.012213
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002358 | Grad Max: 0.198900
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043952 | Grad Max: 1.112202
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000529 | Grad Max: 0.013079
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019772 | Grad Max: 0.069388
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001016
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004516 | Grad Max: 0.009847
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000483
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001547 | Grad Max: 0.003809
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003417 | Grad Max: 0.006690
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050565 | Grad Max: 0.050565
[GRADIENT NORM TOTAL] 5.8872

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.195
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53141266 0.46858734] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.017
[MASKS] A(Pass/Fail): 111/1505 | B: 216/1640 | C: 176/1200
[LOSS Ex1] A: 0.67730 | B: 0.67935 | C: 0.67259
[LOGITS Ex2 A] Mean Abs: 1.274 | Max: 4.854
[LOSS Ex2] A: 0.28379 | B: 0.41307 | C: 0.41816
** [JOINT LOSS] ** : 1.048084
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005417 | Grad Max: 0.145894
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207262 | Grad Max: 1.046324
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001996 | Grad Max: 0.008272
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002261 | Grad Max: 0.002261
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001680 | Grad Max: 0.124984
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030880 | Grad Max: 0.674745
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000383 | Grad Max: 0.009515
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014125 | Grad Max: 0.049476
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000739
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003236 | Grad Max: 0.006965
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000395
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001124 | Grad Max: 0.003278
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002647 | Grad Max: 0.004996
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038076 | Grad Max: 0.038076
[GRADIENT NORM TOTAL] 4.1000

[EPOCH SUMMARY] Train Loss: 1.0440

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0164 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0183 -> New: 1.0164)

############################## EPOCH 37/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.322
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50807244 0.49192753] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 172/1876 | B: 213/1835 | C: 241/1807
[LOSS Ex1] A: 0.67758 | B: 0.67895 | C: 0.67541
[LOGITS Ex2 A] Mean Abs: 1.325 | Max: 6.232
[LOSS Ex2] A: 0.28962 | B: 0.44537 | C: 0.37087
** [JOINT LOSS] ** : 1.045933
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003298 | Grad Max: 0.073755
  -> Layer: shared_layers.0.bias | Grad Mean: 0.194683 | Grad Max: 1.030853
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.009231
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011654 | Grad Max: 0.011654
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001421 | Grad Max: 0.105620
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026897 | Grad Max: 0.588506
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000334 | Grad Max: 0.008185
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012638 | Grad Max: 0.046387
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000736
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002839 | Grad Max: 0.006741
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000324
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000974 | Grad Max: 0.002499
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001927 | Grad Max: 0.004268
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031068 | Grad Max: 0.031068
[GRADIENT NORM TOTAL] 3.8182

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.305
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50192165 0.49807835] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 150/1898 | B: 239/1809 | C: 242/1806
[LOSS Ex1] A: 0.67694 | B: 0.67935 | C: 0.67489
[LOGITS Ex2 A] Mean Abs: 1.315 | Max: 4.916
[LOSS Ex2] A: 0.29578 | B: 0.42771 | C: 0.38293
** [JOINT LOSS] ** : 1.045865
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005788 | Grad Max: 0.150625
  -> Layer: shared_layers.0.bias | Grad Mean: 0.307069 | Grad Max: 1.631666
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.009497
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012383 | Grad Max: 0.012383
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.188430
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043043 | Grad Max: 1.037250
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000525 | Grad Max: 0.014006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019927 | Grad Max: 0.075805
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001068
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004490 | Grad Max: 0.010405
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000462
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001536 | Grad Max: 0.003853
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003052 | Grad Max: 0.006041
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048466 | Grad Max: 0.048466
[GRADIENT NORM TOTAL] 6.0411

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.313
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047155  0.49528447] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 157/1891 | B: 234/1814 | C: 256/1792
[LOSS Ex1] A: 0.67605 | B: 0.67704 | C: 0.67379
[LOGITS Ex2 A] Mean Abs: 1.294 | Max: 5.183
[LOSS Ex2] A: 0.29794 | B: 0.42009 | C: 0.38327
** [JOINT LOSS] ** : 1.042727
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004457 | Grad Max: 0.145948
  -> Layer: shared_layers.0.bias | Grad Mean: 0.159860 | Grad Max: 0.832347
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.009548
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006169 | Grad Max: 0.006169
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001336 | Grad Max: 0.087666
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024643 | Grad Max: 0.446413
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000300 | Grad Max: 0.007554
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011238 | Grad Max: 0.036524
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000697
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002566 | Grad Max: 0.006711
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000316
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000880 | Grad Max: 0.002405
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001744 | Grad Max: 0.003954
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027709 | Grad Max: 0.027709
[GRADIENT NORM TOTAL] 3.2030

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.268
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50745606 0.49254394] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 142/1906 | B: 217/1639 | C: 240/1808
[LOSS Ex1] A: 0.67894 | B: 0.67926 | C: 0.67616
[LOGITS Ex2 A] Mean Abs: 1.229 | Max: 5.066
[LOSS Ex2] A: 0.29064 | B: 0.40624 | C: 0.39409
** [JOINT LOSS] ** : 1.041782
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004421 | Grad Max: 0.086008
  -> Layer: shared_layers.0.bias | Grad Mean: 0.195523 | Grad Max: 0.967803
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001822 | Grad Max: 0.008174
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008061 | Grad Max: 0.008061
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001458 | Grad Max: 0.139209
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026864 | Grad Max: 0.795686
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.007421
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012358 | Grad Max: 0.039266
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000704
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002814 | Grad Max: 0.006310
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000321
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000961 | Grad Max: 0.002648
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002163 | Grad Max: 0.004443
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031197 | Grad Max: 0.031197
[GRADIENT NORM TOTAL] 3.8051

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.186
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51442754 0.48557252] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 126/1922 | B: 214/1834 | C: 270/1778
[LOSS Ex1] A: 0.67849 | B: 0.67886 | C: 0.67325
[LOGITS Ex2 A] Mean Abs: 1.195 | Max: 5.000
[LOSS Ex2] A: 0.29214 | B: 0.45236 | C: 0.37967
** [JOINT LOSS] ** : 1.051593
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004808 | Grad Max: 0.132107
  -> Layer: shared_layers.0.bias | Grad Mean: 0.337121 | Grad Max: 1.720214
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001913 | Grad Max: 0.008457
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000963 | Grad Max: 0.000963
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002395 | Grad Max: 0.232237
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045513 | Grad Max: 1.311239
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000558 | Grad Max: 0.012616
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021171 | Grad Max: 0.070996
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001068
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004756 | Grad Max: 0.011058
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000466
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001628 | Grad Max: 0.003907
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003379 | Grad Max: 0.006451
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051518 | Grad Max: 0.051518
[GRADIENT NORM TOTAL] 6.7107

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.290
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.556142   0.44385803] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 157/1891 | B: 244/1804 | C: 251/1797
[LOSS Ex1] A: 0.67710 | B: 0.67926 | C: 0.67424
[LOGITS Ex2 A] Mean Abs: 1.261 | Max: 5.121
[LOSS Ex2] A: 0.27393 | B: 0.42432 | C: 0.37446
** [JOINT LOSS] ** : 1.034434
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.069340
  -> Layer: shared_layers.0.bias | Grad Mean: 0.147570 | Grad Max: 0.785543
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.009087
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011036 | Grad Max: 0.011036
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001077 | Grad Max: 0.073411
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020260 | Grad Max: 0.401177
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000252 | Grad Max: 0.006890
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009593 | Grad Max: 0.037797
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000538
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002163 | Grad Max: 0.005308
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000739 | Grad Max: 0.002228
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001614 | Grad Max: 0.003750
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024025 | Grad Max: 0.024025
[GRADIENT NORM TOTAL] 2.9453

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.324
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032351  0.49676493] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 153/1895 | B: 234/1814 | C: 248/1800
[LOSS Ex1] A: 0.67937 | B: 0.67695 | C: 0.67469
[LOGITS Ex2 A] Mean Abs: 1.293 | Max: 5.254
[LOSS Ex2] A: 0.28026 | B: 0.41383 | C: 0.37032
** [JOINT LOSS] ** : 1.031806
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003040 | Grad Max: 0.082154
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191184 | Grad Max: 1.022766
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.008594
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010646 | Grad Max: 0.010646
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001388 | Grad Max: 0.091581
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026708 | Grad Max: 0.524436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000329 | Grad Max: 0.009720
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012561 | Grad Max: 0.049560
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000705
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002812 | Grad Max: 0.006716
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000312
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000965 | Grad Max: 0.002454
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001848 | Grad Max: 0.004383
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030538 | Grad Max: 0.030538
[GRADIENT NORM TOTAL] 3.7632

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.148
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5534702  0.44652978] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 155/1893 | B: 219/1637 | C: 256/1792
[LOSS Ex1] A: 0.67801 | B: 0.67917 | C: 0.67536
[LOGITS Ex2 A] Mean Abs: 1.306 | Max: 4.832
[LOSS Ex2] A: 0.29982 | B: 0.42016 | C: 0.38436
** [JOINT LOSS] ** : 1.045624
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005210 | Grad Max: 0.121412
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268739 | Grad Max: 1.426693
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001828 | Grad Max: 0.008178
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003450 | Grad Max: 0.003450
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.132098
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037480 | Grad Max: 0.744844
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000455 | Grad Max: 0.011259
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017174 | Grad Max: 0.054558
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000832
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003873 | Grad Max: 0.008540
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000412
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001326 | Grad Max: 0.003564
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002818 | Grad Max: 0.005579
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042915 | Grad Max: 0.042915
[GRADIENT NORM TOTAL] 5.2243

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.198
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5318717  0.46812835] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 121/1495 | B: 217/1831 | C: 271/1777
[LOSS Ex1] A: 0.67713 | B: 0.67877 | C: 0.67296
[LOGITS Ex2 A] Mean Abs: 1.329 | Max: 4.906
[LOSS Ex2] A: 0.27423 | B: 0.43251 | C: 0.35252
** [JOINT LOSS] ** : 1.029372
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001758 | Grad Max: 0.037446
  -> Layer: shared_layers.0.bias | Grad Mean: 0.070263 | Grad Max: 0.264036
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001989 | Grad Max: 0.008802
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002424 | Grad Max: 0.002424
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000533 | Grad Max: 0.067574
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009824 | Grad Max: 0.385452
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000105 | Grad Max: 0.004154
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003939 | Grad Max: 0.019615
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000317
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000890 | Grad Max: 0.002630
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000138
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000305 | Grad Max: 0.001047
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000592 | Grad Max: 0.002172
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009298 | Grad Max: 0.009298
[GRADIENT NORM TOTAL] 1.4018

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.325
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080243  0.49197572] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 177/1871 | B: 246/1802 | C: 245/1803
[LOSS Ex1] A: 0.67740 | B: 0.67917 | C: 0.67422
[LOGITS Ex2 A] Mean Abs: 1.295 | Max: 5.370
[LOSS Ex2] A: 0.28497 | B: 0.43303 | C: 0.37665
** [JOINT LOSS] ** : 1.041811
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004953 | Grad Max: 0.130297
  -> Layer: shared_layers.0.bias | Grad Mean: 0.262305 | Grad Max: 1.391007
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001913 | Grad Max: 0.008214
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003003 | Grad Max: 0.003003
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.136117
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037250 | Grad Max: 0.771709
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.011469
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017683 | Grad Max: 0.062424
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000838
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003978 | Grad Max: 0.009090
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000391
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001355 | Grad Max: 0.003347
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002786 | Grad Max: 0.004970
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042893 | Grad Max: 0.042893
[GRADIENT NORM TOTAL] 5.1481

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.307
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020337 0.4979663] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 159/1889 | B: 236/1812 | C: 250/1798
[LOSS Ex1] A: 0.67674 | B: 0.67685 | C: 0.67423
[LOGITS Ex2 A] Mean Abs: 1.271 | Max: 4.991
[LOSS Ex2] A: 0.29415 | B: 0.42358 | C: 0.38651
** [JOINT LOSS] ** : 1.044018
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005549 | Grad Max: 0.123255
  -> Layer: shared_layers.0.bias | Grad Mean: 0.305819 | Grad Max: 1.548545
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.009433
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011216 | Grad Max: 0.011216
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.152736
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041791 | Grad Max: 0.869083
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000517 | Grad Max: 0.011594
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019504 | Grad Max: 0.068430
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000974
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004400 | Grad Max: 0.009969
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000471
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001506 | Grad Max: 0.003678
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003192 | Grad Max: 0.006363
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048509 | Grad Max: 0.048509
[GRADIENT NORM TOTAL] 5.8589

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.315
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046567 0.4953433] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 166/1882 | B: 221/1635 | C: 232/1816
[LOSS Ex1] A: 0.67585 | B: 0.67907 | C: 0.67592
[LOGITS Ex2 A] Mean Abs: 1.259 | Max: 5.261
[LOSS Ex2] A: 0.29228 | B: 0.40103 | C: 0.37010
** [JOINT LOSS] ** : 1.031420
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002605 | Grad Max: 0.064743
  -> Layer: shared_layers.0.bias | Grad Mean: 0.151883 | Grad Max: 0.785398
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.010076
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014324 | Grad Max: 0.014324
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001064 | Grad Max: 0.087506
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019626 | Grad Max: 0.490420
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.006747
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009201 | Grad Max: 0.037587
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000558
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002058 | Grad Max: 0.005414
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000240
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000690 | Grad Max: 0.001868
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001396 | Grad Max: 0.003336
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021301 | Grad Max: 0.021301
[GRADIENT NORM TOTAL] 2.8982

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.270
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074064 0.4925936] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 149/1899 | B: 219/1829 | C: 253/1795
[LOSS Ex1] A: 0.67878 | B: 0.67867 | C: 0.67464
[LOGITS Ex2 A] Mean Abs: 1.289 | Max: 5.101
[LOSS Ex2] A: 0.28741 | B: 0.45646 | C: 0.36895
** [JOINT LOSS] ** : 1.048304
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004453 | Grad Max: 0.104131
  -> Layer: shared_layers.0.bias | Grad Mean: 0.255662 | Grad Max: 1.351923
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.008069
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006867 | Grad Max: 0.006867
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001877 | Grad Max: 0.122769
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035453 | Grad Max: 0.694595
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000438 | Grad Max: 0.011523
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016658 | Grad Max: 0.067397
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000932
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003748 | Grad Max: 0.009666
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000403
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001277 | Grad Max: 0.003237
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002454 | Grad Max: 0.005039
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039881 | Grad Max: 0.039881
[GRADIENT NORM TOTAL] 5.0237

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.188
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5146518  0.48534822] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.017
[MASKS] A(Pass/Fail): 138/1910 | B: 248/1800 | C: 164/1212
[LOSS Ex1] A: 0.67832 | B: 0.67907 | C: 0.67653
[LOGITS Ex2 A] Mean Abs: 1.292 | Max: 5.172
[LOSS Ex2] A: 0.30018 | B: 0.43565 | C: 0.38199
** [JOINT LOSS] ** : 1.050584
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006964 | Grad Max: 0.180701
  -> Layer: shared_layers.0.bias | Grad Mean: 0.330869 | Grad Max: 1.743767
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001786 | Grad Max: 0.008078
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004071 | Grad Max: 0.004071
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002487 | Grad Max: 0.179190
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046922 | Grad Max: 1.034956
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.014398
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021630 | Grad Max: 0.077025
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001172
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004871 | Grad Max: 0.011575
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000483
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001655 | Grad Max: 0.004059
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003360 | Grad Max: 0.006316
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051533 | Grad Max: 0.051533
[GRADIENT NORM TOTAL] 6.5052

[EPOCH SUMMARY] Train Loss: 1.0418

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0149 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0164 -> New: 1.0149)

############################## EPOCH 38/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.293
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5570397  0.44296032] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 171/1877 | B: 237/1811 | C: 270/1778
[LOSS Ex1] A: 0.67689 | B: 0.67674 | C: 0.67368
[LOGITS Ex2 A] Mean Abs: 1.319 | Max: 5.317
[LOSS Ex2] A: 0.27961 | B: 0.42083 | C: 0.37828
** [JOINT LOSS] ** : 1.035342
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004073 | Grad Max: 0.101231
  -> Layer: shared_layers.0.bias | Grad Mean: 0.243084 | Grad Max: 1.290912
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.009686
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014463 | Grad Max: 0.014463
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001764 | Grad Max: 0.149091
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033541 | Grad Max: 0.841282
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000419 | Grad Max: 0.010714
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015832 | Grad Max: 0.060737
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000866
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003531 | Grad Max: 0.008568
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000401
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001210 | Grad Max: 0.002987
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002486 | Grad Max: 0.005244
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038863 | Grad Max: 0.038863
[GRADIENT NORM TOTAL] 4.8480

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.327
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50317013 0.49682987] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 160/1888 | B: 222/1634 | C: 265/1783
[LOSS Ex1] A: 0.67919 | B: 0.67898 | C: 0.67320
[LOGITS Ex2 A] Mean Abs: 1.279 | Max: 5.349
[LOSS Ex2] A: 0.27753 | B: 0.40114 | C: 0.39298
** [JOINT LOSS] ** : 1.034339
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003465 | Grad Max: 0.119895
  -> Layer: shared_layers.0.bias | Grad Mean: 0.119797 | Grad Max: 0.600078
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.008536
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008002 | Grad Max: 0.008002
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001001 | Grad Max: 0.057558
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017911 | Grad Max: 0.326519
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.005331
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007701 | Grad Max: 0.027036
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000488
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001794 | Grad Max: 0.004626
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000233
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000618 | Grad Max: 0.001730
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001473 | Grad Max: 0.003209
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020775 | Grad Max: 0.020775
[GRADIENT NORM TOTAL] 2.3561

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.150
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5542421  0.44575796] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 167/1881 | B: 221/1827 | C: 252/1796
[LOSS Ex1] A: 0.67781 | B: 0.67858 | C: 0.67513
[LOGITS Ex2 A] Mean Abs: 1.273 | Max: 5.531
[LOSS Ex2] A: 0.28893 | B: 0.44438 | C: 0.37887
** [JOINT LOSS] ** : 1.047905
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003789 | Grad Max: 0.094703
  -> Layer: shared_layers.0.bias | Grad Mean: 0.211983 | Grad Max: 1.124358
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001879 | Grad Max: 0.008484
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005563 | Grad Max: 0.005563
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001642 | Grad Max: 0.097210
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030942 | Grad Max: 0.561610
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000378 | Grad Max: 0.008144
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014323 | Grad Max: 0.048248
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000837
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003233 | Grad Max: 0.007964
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000349
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001105 | Grad Max: 0.002845
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002391 | Grad Max: 0.004793
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035773 | Grad Max: 0.035773
[GRADIENT NORM TOTAL] 4.2508

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.200
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5323924  0.46760762] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 132/1484 | B: 249/1799 | C: 258/1790
[LOSS Ex1] A: 0.67693 | B: 0.67898 | C: 0.67488
[LOGITS Ex2 A] Mean Abs: 1.320 | Max: 5.895
[LOSS Ex2] A: 0.27477 | B: 0.42279 | C: 0.35313
** [JOINT LOSS] ** : 1.027163
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.052841
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087463 | Grad Max: 0.489496
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.008643
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004136 | Grad Max: 0.004136
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000722 | Grad Max: 0.043029
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013431 | Grad Max: 0.242410
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.004458
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006364 | Grad Max: 0.023699
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000434
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001432 | Grad Max: 0.003651
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000187
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000477 | Grad Max: 0.001500
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001141 | Grad Max: 0.003184
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015560 | Grad Max: 0.015560
[GRADIENT NORM TOTAL] 1.7995

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.328
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079692 0.4920308] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 186/1862 | B: 240/1808 | C: 229/1819
[LOSS Ex1] A: 0.67720 | B: 0.67664 | C: 0.67559
[LOGITS Ex2 A] Mean Abs: 1.346 | Max: 6.928
[LOSS Ex2] A: 0.30602 | B: 0.42847 | C: 0.36728
** [JOINT LOSS] ** : 1.043736
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006215 | Grad Max: 0.149922
  -> Layer: shared_layers.0.bias | Grad Mean: 0.330675 | Grad Max: 1.732458
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001994 | Grad Max: 0.009068
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010415 | Grad Max: 0.010415
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002436 | Grad Max: 0.164000
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046195 | Grad Max: 0.920168
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000554 | Grad Max: 0.012689
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021004 | Grad Max: 0.073064
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001138
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004737 | Grad Max: 0.010976
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000509
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001619 | Grad Max: 0.003945
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003311 | Grad Max: 0.006266
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051502 | Grad Max: 0.051502
[GRADIENT NORM TOTAL] 6.4764

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.310
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021597  0.49784032] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 174/1874 | B: 224/1632 | C: 241/1807
[LOSS Ex1] A: 0.67653 | B: 0.67888 | C: 0.67384
[LOGITS Ex2 A] Mean Abs: 1.346 | Max: 5.258
[LOSS Ex2] A: 0.30622 | B: 0.42884 | C: 0.38105
** [JOINT LOSS] ** : 1.048453
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008243 | Grad Max: 0.201307
  -> Layer: shared_layers.0.bias | Grad Mean: 0.419588 | Grad Max: 2.288453
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.009343
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010372 | Grad Max: 0.010372
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003204 | Grad Max: 0.208186
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060903 | Grad Max: 1.140034
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000741 | Grad Max: 0.017857
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028110 | Grad Max: 0.101718
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000118 | Grad Max: 0.001316
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006324 | Grad Max: 0.013664
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000649
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002152 | Grad Max: 0.005208
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004473 | Grad Max: 0.008056
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.068655 | Grad Max: 0.068655
[GRADIENT NORM TOTAL] 8.3755

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.318
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50456184 0.49543813] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 176/1872 | B: 221/1827 | C: 257/1791
[LOSS Ex1] A: 0.67564 | B: 0.67848 | C: 0.67515
[LOGITS Ex2 A] Mean Abs: 1.343 | Max: 6.539
[LOSS Ex2] A: 0.31053 | B: 0.45234 | C: 0.38682
** [JOINT LOSS] ** : 1.059652
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007379 | Grad Max: 0.204467
  -> Layer: shared_layers.0.bias | Grad Mean: 0.307654 | Grad Max: 1.606939
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.009672
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011397 | Grad Max: 0.011397
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002456 | Grad Max: 0.148919
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045764 | Grad Max: 0.850356
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.012197
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020699 | Grad Max: 0.067522
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001128
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004692 | Grad Max: 0.011279
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000483
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001579 | Grad Max: 0.004035
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003166 | Grad Max: 0.005953
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049357 | Grad Max: 0.049357
[GRADIENT NORM TOTAL] 6.1507

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.272
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073436  0.49265638] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 157/1891 | B: 249/1799 | C: 246/1802
[LOSS Ex1] A: 0.67861 | B: 0.67889 | C: 0.67380
[LOGITS Ex2 A] Mean Abs: 1.278 | Max: 5.181
[LOSS Ex2] A: 0.28278 | B: 0.42301 | C: 0.37972
** [JOINT LOSS] ** : 1.038937
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.035486
  -> Layer: shared_layers.0.bias | Grad Mean: 0.035075 | Grad Max: 0.241827
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001832 | Grad Max: 0.007252
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002201 | Grad Max: 0.002201
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000336 | Grad Max: 0.038624
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005818 | Grad Max: 0.216635
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.002838
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002460 | Grad Max: 0.011877
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000213
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000553 | Grad Max: 0.002072
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000117
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000195 | Grad Max: 0.000670
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000727 | Grad Max: 0.002156
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008249 | Grad Max: 0.008249
[GRADIENT NORM TOTAL] 0.7841

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.189
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51484597 0.48515403] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 150/1898 | B: 241/1807 | C: 254/1794
[LOSS Ex1] A: 0.67814 | B: 0.67654 | C: 0.67427
[LOGITS Ex2 A] Mean Abs: 1.224 | Max: 5.174
[LOSS Ex2] A: 0.30574 | B: 0.41341 | C: 0.37008
** [JOINT LOSS] ** : 1.039396
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003446 | Grad Max: 0.074239
  -> Layer: shared_layers.0.bias | Grad Mean: 0.190360 | Grad Max: 0.922417
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001959 | Grad Max: 0.008575
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004917 | Grad Max: 0.004917
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001356 | Grad Max: 0.172356
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025350 | Grad Max: 0.974774
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.007746
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011393 | Grad Max: 0.045577
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000728
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002562 | Grad Max: 0.006627
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000298
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000857 | Grad Max: 0.002328
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001783 | Grad Max: 0.003972
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026973 | Grad Max: 0.026973
[GRADIENT NORM TOTAL] 3.8029

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.296
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55797577 0.44202417] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 185/1863 | B: 227/1629 | C: 244/1804
[LOSS Ex1] A: 0.67668 | B: 0.67879 | C: 0.67464
[LOGITS Ex2 A] Mean Abs: 1.276 | Max: 4.977
[LOSS Ex2] A: 0.27479 | B: 0.40557 | C: 0.36399
** [JOINT LOSS] ** : 1.024820
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001632 | Grad Max: 0.032204
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087561 | Grad Max: 0.406590
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.009585
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018918 | Grad Max: 0.018918
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000628 | Grad Max: 0.113106
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011073 | Grad Max: 0.639460
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.003857
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004368 | Grad Max: 0.019942
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000294
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000955 | Grad Max: 0.002853
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000144
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000309 | Grad Max: 0.001187
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000692 | Grad Max: 0.002266
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008864 | Grad Max: 0.008864
[GRADIENT NORM TOTAL] 1.8334

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.330
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50311506 0.49688497] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 170/1878 | B: 222/1826 | C: 257/1791
[LOSS Ex1] A: 0.67901 | B: 0.67837 | C: 0.67400
[LOGITS Ex2 A] Mean Abs: 1.315 | Max: 4.740
[LOSS Ex2] A: 0.27249 | B: 0.43941 | C: 0.35633
** [JOINT LOSS] ** : 1.033203
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003786 | Grad Max: 0.095272
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160657 | Grad Max: 0.898239
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.008843
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013019 | Grad Max: 0.013019
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001214 | Grad Max: 0.071034
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023418 | Grad Max: 0.390273
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.007450
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010925 | Grad Max: 0.036798
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000679
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002434 | Grad Max: 0.006004
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000292
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000827 | Grad Max: 0.002150
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001582 | Grad Max: 0.003799
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026459 | Grad Max: 0.026459
[GRADIENT NORM TOTAL] 3.1570

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.152
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55504173 0.44495824] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 182/1866 | B: 250/1798 | C: 228/1820
[LOSS Ex1] A: 0.67761 | B: 0.67877 | C: 0.67616
[LOGITS Ex2 A] Mean Abs: 1.319 | Max: 5.481
[LOSS Ex2] A: 0.29173 | B: 0.42960 | C: 0.39219
** [JOINT LOSS] ** : 1.048687
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.050820
  -> Layer: shared_layers.0.bias | Grad Mean: 0.082961 | Grad Max: 0.457914
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.007459
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003851 | Grad Max: 0.003851
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000674 | Grad Max: 0.056316
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012000 | Grad Max: 0.297301
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.003841
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004837 | Grad Max: 0.019884
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000313
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001095 | Grad Max: 0.003267
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000362 | Grad Max: 0.001166
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000549 | Grad Max: 0.001593
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009859 | Grad Max: 0.009859
[GRADIENT NORM TOTAL] 1.6644

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.203
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53288484 0.46711513] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 145/1471 | B: 246/1802 | C: 251/1797
[LOSS Ex1] A: 0.67671 | B: 0.67641 | C: 0.67383
[LOGITS Ex2 A] Mean Abs: 1.333 | Max: 5.787
[LOSS Ex2] A: 0.27706 | B: 0.41767 | C: 0.35579
** [JOINT LOSS] ** : 1.025823
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003144 | Grad Max: 0.078857
  -> Layer: shared_layers.0.bias | Grad Mean: 0.138267 | Grad Max: 0.705285
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.009274
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005373 | Grad Max: 0.005373
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001049 | Grad Max: 0.072856
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019143 | Grad Max: 0.410059
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000227 | Grad Max: 0.006169
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008558 | Grad Max: 0.030934
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000478
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001958 | Grad Max: 0.004485
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000228
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000659 | Grad Max: 0.001677
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001498 | Grad Max: 0.003391
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021473 | Grad Max: 0.021473
[GRADIENT NORM TOTAL] 2.7265

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.331
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50797945 0.49202058] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 195/1853 | B: 227/1629 | C: 175/1201
[LOSS Ex1] A: 0.67697 | B: 0.67866 | C: 0.67189
[LOGITS Ex2 A] Mean Abs: 1.329 | Max: 5.634
[LOSS Ex2] A: 0.27567 | B: 0.40664 | C: 0.35761
** [JOINT LOSS] ** : 1.022477
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003318 | Grad Max: 0.096869
  -> Layer: shared_layers.0.bias | Grad Mean: 0.099935 | Grad Max: 0.485266
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.009078
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009701 | Grad Max: 0.009701
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000783 | Grad Max: 0.060454
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014102 | Grad Max: 0.303901
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000164 | Grad Max: 0.004348
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006220 | Grad Max: 0.022694
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000417
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001430 | Grad Max: 0.004019
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000468 | Grad Max: 0.001670
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000912 | Grad Max: 0.002603
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013557 | Grad Max: 0.013557
[GRADIENT NORM TOTAL] 1.9432

[EPOCH SUMMARY] Train Loss: 1.0379

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0083 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0149 -> New: 1.0083)

############################## EPOCH 39/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.313
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50224197 0.49775803] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 187/1861 | B: 225/1823 | C: 226/1822
[LOSS Ex1] A: 0.67627 | B: 0.67824 | C: 0.67599
[LOGITS Ex2 A] Mean Abs: 1.334 | Max: 4.666
[LOSS Ex2] A: 0.27509 | B: 0.43576 | C: 0.37658
** [JOINT LOSS] ** : 1.039312
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002823 | Grad Max: 0.070684
  -> Layer: shared_layers.0.bias | Grad Mean: 0.151383 | Grad Max: 0.795545
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.009897
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016292 | Grad Max: 0.016292
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001139 | Grad Max: 0.095704
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021748 | Grad Max: 0.532723
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000263 | Grad Max: 0.006787
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010067 | Grad Max: 0.034554
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000527
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002245 | Grad Max: 0.005288
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000254
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000752 | Grad Max: 0.001941
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001342 | Grad Max: 0.003083
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023180 | Grad Max: 0.023180
[GRADIENT NORM TOTAL] 3.0440

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.322
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.504523 0.495477] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 196/1852 | B: 251/1797 | C: 233/1815
[LOSS Ex1] A: 0.67536 | B: 0.67863 | C: 0.67634
[LOGITS Ex2 A] Mean Abs: 1.332 | Max: 5.457
[LOSS Ex2] A: 0.29055 | B: 0.42453 | C: 0.38044
** [JOINT LOSS] ** : 1.041952
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003809 | Grad Max: 0.107234
  -> Layer: shared_layers.0.bias | Grad Mean: 0.128149 | Grad Max: 0.570515
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.009439
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011854 | Grad Max: 0.011854
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001100 | Grad Max: 0.093804
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019932 | Grad Max: 0.530769
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000239 | Grad Max: 0.005832
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008893 | Grad Max: 0.035508
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000511
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002014 | Grad Max: 0.005322
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000230
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000679 | Grad Max: 0.001811
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.003130
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020932 | Grad Max: 0.020932
[GRADIENT NORM TOTAL] 2.6118

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.275
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073177  0.49268225] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 170/1878 | B: 248/1800 | C: 253/1795
[LOSS Ex1] A: 0.67837 | B: 0.67626 | C: 0.67374
[LOGITS Ex2 A] Mean Abs: 1.279 | Max: 5.051
[LOSS Ex2] A: 0.27809 | B: 0.41300 | C: 0.37414
** [JOINT LOSS] ** : 1.031201
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003386 | Grad Max: 0.084104
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132841 | Grad Max: 0.775164
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001927 | Grad Max: 0.007288
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004024 | Grad Max: 0.004024
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001070 | Grad Max: 0.107776
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019810 | Grad Max: 0.606468
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000239 | Grad Max: 0.005832
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009129 | Grad Max: 0.035020
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000548
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002072 | Grad Max: 0.005479
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000690 | Grad Max: 0.001956
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001510 | Grad Max: 0.003633
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021906 | Grad Max: 0.021906
[GRADIENT NORM TOTAL] 2.7918

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.191
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5151163  0.48488367] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.517 | Std: 0.018
[MASKS] A(Pass/Fail): 160/1888 | B: 230/1626 | C: 239/1809
[LOSS Ex1] A: 0.67790 | B: 0.67852 | C: 0.67406
[LOGITS Ex2 A] Mean Abs: 1.262 | Max: 5.598
[LOSS Ex2] A: 0.28214 | B: 0.40254 | C: 0.37509
** [JOINT LOSS] ** : 1.030082
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003702 | Grad Max: 0.078339
  -> Layer: shared_layers.0.bias | Grad Mean: 0.168003 | Grad Max: 0.863160
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.007666
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002338 | Grad Max: 0.002338
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001291 | Grad Max: 0.088398
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024066 | Grad Max: 0.508183
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000292 | Grad Max: 0.006895
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011086 | Grad Max: 0.039754
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000702
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002497 | Grad Max: 0.006576
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000291
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000828 | Grad Max: 0.002134
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001756 | Grad Max: 0.003916
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026002 | Grad Max: 0.026002
[GRADIENT NORM TOTAL] 3.3051

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.299
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5592191 0.4407809] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.019
[MASKS] A(Pass/Fail): 197/1851 | B: 231/1817 | C: 263/1785
[LOSS Ex1] A: 0.67639 | B: 0.67810 | C: 0.67243
[LOGITS Ex2 A] Mean Abs: 1.328 | Max: 5.146
[LOSS Ex2] A: 0.27583 | B: 0.43232 | C: 0.34470
** [JOINT LOSS] ** : 1.026587
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003042 | Grad Max: 0.072090
  -> Layer: shared_layers.0.bias | Grad Mean: 0.129853 | Grad Max: 0.629956
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.008966
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010019 | Grad Max: 0.010019
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000962 | Grad Max: 0.098994
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017834 | Grad Max: 0.534929
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.005735
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008163 | Grad Max: 0.028009
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000463
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001831 | Grad Max: 0.004541
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000220
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000619 | Grad Max: 0.001755
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001158 | Grad Max: 0.003179
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019265 | Grad Max: 0.019265
[GRADIENT NORM TOTAL] 2.5507

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.334
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030554  0.49694455] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 176/1872 | B: 255/1793 | C: 272/1776
[LOSS Ex1] A: 0.67876 | B: 0.67849 | C: 0.67318
[LOGITS Ex2 A] Mean Abs: 1.325 | Max: 5.376
[LOSS Ex2] A: 0.26526 | B: 0.42764 | C: 0.35864
** [JOINT LOSS] ** : 1.027328
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.052877
  -> Layer: shared_layers.0.bias | Grad Mean: 0.136197 | Grad Max: 0.713654
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001998 | Grad Max: 0.008819
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014646 | Grad Max: 0.014646
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000970 | Grad Max: 0.067131
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018085 | Grad Max: 0.378815
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000228 | Grad Max: 0.006062
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008688 | Grad Max: 0.035410
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000479
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001894 | Grad Max: 0.004600
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000214
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001757
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001155 | Grad Max: 0.003683
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019926 | Grad Max: 0.019926
[GRADIENT NORM TOTAL] 2.6737

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.155
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5561235 0.4438765] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 204/1844 | B: 250/1798 | C: 257/1791
[LOSS Ex1] A: 0.67733 | B: 0.67611 | C: 0.67367
[LOGITS Ex2 A] Mean Abs: 1.302 | Max: 5.325
[LOSS Ex2] A: 0.28020 | B: 0.40738 | C: 0.38627
** [JOINT LOSS] ** : 1.033653
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002318 | Grad Max: 0.051602
  -> Layer: shared_layers.0.bias | Grad Mean: 0.130000 | Grad Max: 0.642338
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.009074
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007045 | Grad Max: 0.007045
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000929 | Grad Max: 0.072907
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017266 | Grad Max: 0.405899
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.005603
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007816 | Grad Max: 0.029030
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000424
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001757 | Grad Max: 0.004262
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000586 | Grad Max: 0.001694
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001206 | Grad Max: 0.002736
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018335 | Grad Max: 0.018335
[GRADIENT NORM TOTAL] 2.5181

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.207
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5335658  0.46643415] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.019
[MASKS] A(Pass/Fail): 153/1463 | B: 231/1625 | C: 250/1798
[LOSS Ex1] A: 0.67643 | B: 0.67838 | C: 0.67334
[LOGITS Ex2 A] Mean Abs: 1.336 | Max: 5.641
[LOSS Ex2] A: 0.27065 | B: 0.40573 | C: 0.36689
** [JOINT LOSS] ** : 1.023805
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002827 | Grad Max: 0.069330
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160580 | Grad Max: 0.747793
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001942 | Grad Max: 0.008686
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004169 | Grad Max: 0.004169
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001129 | Grad Max: 0.096703
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020838 | Grad Max: 0.554275
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.006250
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009475 | Grad Max: 0.032921
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000528
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002130 | Grad Max: 0.004940
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000269
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000702 | Grad Max: 0.002112
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001455 | Grad Max: 0.003946
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021690 | Grad Max: 0.021690
[GRADIENT NORM TOTAL] 3.1087

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.335
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079225  0.49207756] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 211/1837 | B: 234/1814 | C: 262/1786
[LOSS Ex1] A: 0.67668 | B: 0.67796 | C: 0.67299
[LOGITS Ex2 A] Mean Abs: 1.347 | Max: 6.787
[LOSS Ex2] A: 0.27412 | B: 0.43545 | C: 0.36109
** [JOINT LOSS] ** : 1.032762
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002956 | Grad Max: 0.085333
  -> Layer: shared_layers.0.bias | Grad Mean: 0.105309 | Grad Max: 0.498212
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.008435
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003517 | Grad Max: 0.003518
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000810 | Grad Max: 0.068471
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014803 | Grad Max: 0.373996
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000176 | Grad Max: 0.005593
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006659 | Grad Max: 0.025481
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000463
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001507 | Grad Max: 0.004112
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000501 | Grad Max: 0.001455
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000884 | Grad Max: 0.002716
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015134 | Grad Max: 0.015134
[GRADIENT NORM TOTAL] 2.0248

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.317
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023615  0.49763855] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 200/1848 | B: 257/1791 | C: 279/1769
[LOSS Ex1] A: 0.67597 | B: 0.67835 | C: 0.67149
[LOGITS Ex2 A] Mean Abs: 1.337 | Max: 5.756
[LOSS Ex2] A: 0.27178 | B: 0.42632 | C: 0.34870
** [JOINT LOSS] ** : 1.024201
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003002 | Grad Max: 0.076742
  -> Layer: shared_layers.0.bias | Grad Mean: 0.098808 | Grad Max: 0.459054
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.009253
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010470 | Grad Max: 0.010470
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000757 | Grad Max: 0.050561
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013559 | Grad Max: 0.273367
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.004251
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005837 | Grad Max: 0.020700
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000376
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001340 | Grad Max: 0.003294
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000173
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000456 | Grad Max: 0.001346
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000809 | Grad Max: 0.002823
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013761 | Grad Max: 0.013761
[GRADIENT NORM TOTAL] 1.9296

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.326
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50444806 0.49555194] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 204/1844 | B: 253/1795 | C: 258/1790
[LOSS Ex1] A: 0.67505 | B: 0.67596 | C: 0.67421
[LOGITS Ex2 A] Mean Abs: 1.320 | Max: 5.917
[LOSS Ex2] A: 0.28902 | B: 0.41350 | C: 0.36174
** [JOINT LOSS] ** : 1.029824
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.068181
  -> Layer: shared_layers.0.bias | Grad Mean: 0.161142 | Grad Max: 0.778228
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.009773
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011687 | Grad Max: 0.011687
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001090 | Grad Max: 0.088751
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020233 | Grad Max: 0.495700
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.007143
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009427 | Grad Max: 0.036998
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000529
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002078 | Grad Max: 0.004738
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000693 | Grad Max: 0.002006
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001421 | Grad Max: 0.003137
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022162 | Grad Max: 0.022162
[GRADIENT NORM TOTAL] 3.0677

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.279
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50725543 0.49274454] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 189/1859 | B: 234/1622 | C: 245/1803
[LOSS Ex1] A: 0.67811 | B: 0.67824 | C: 0.67287
[LOGITS Ex2 A] Mean Abs: 1.313 | Max: 5.323
[LOSS Ex2] A: 0.26815 | B: 0.40779 | C: 0.35330
** [JOINT LOSS] ** : 1.019487
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004892 | Grad Max: 0.147861
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208100 | Grad Max: 1.011533
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.007929
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002693 | Grad Max: 0.002693
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001577 | Grad Max: 0.119885
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029085 | Grad Max: 0.664113
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.009383
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013409 | Grad Max: 0.048520
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000778
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003020 | Grad Max: 0.008217
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000994 | Grad Max: 0.002641
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002164 | Grad Max: 0.004551
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031013 | Grad Max: 0.031013
[GRADIENT NORM TOTAL] 3.9611

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.193
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5153727  0.48462728] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.018
[MASKS] A(Pass/Fail): 173/1875 | B: 238/1810 | C: 226/1822
[LOSS Ex1] A: 0.67764 | B: 0.67781 | C: 0.67718
[LOGITS Ex2 A] Mean Abs: 1.301 | Max: 5.165
[LOSS Ex2] A: 0.28373 | B: 0.43932 | C: 0.38299
** [JOINT LOSS] ** : 1.046224
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.042228
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094263 | Grad Max: 0.498553
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001856 | Grad Max: 0.008547
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010012 | Grad Max: 0.010012
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000691 | Grad Max: 0.054142
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012557 | Grad Max: 0.287896
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000137 | Grad Max: 0.004145
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005252 | Grad Max: 0.021414
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000391
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001117 | Grad Max: 0.003264
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000152
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000361 | Grad Max: 0.001068
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000608 | Grad Max: 0.001915
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010710 | Grad Max: 0.010710
[GRADIENT NORM TOTAL] 1.8484

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.303
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5605643  0.43943572] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.019
[MASKS] A(Pass/Fail): 210/1838 | B: 265/1783 | C: 175/1201
[LOSS Ex1] A: 0.67608 | B: 0.67821 | C: 0.67278
[LOGITS Ex2 A] Mean Abs: 1.345 | Max: 5.248
[LOSS Ex2] A: 0.26813 | B: 0.42030 | C: 0.36271
** [JOINT LOSS] ** : 1.026072
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001733 | Grad Max: 0.034189
  -> Layer: shared_layers.0.bias | Grad Mean: 0.095689 | Grad Max: 0.505889
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.008881
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008561 | Grad Max: 0.008561
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000709 | Grad Max: 0.076884
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012992 | Grad Max: 0.431158
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.005542
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005679 | Grad Max: 0.025151
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000399
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001268 | Grad Max: 0.004067
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000178
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000430 | Grad Max: 0.001219
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000754 | Grad Max: 0.002647
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012820 | Grad Max: 0.012820
[GRADIENT NORM TOTAL] 1.9602

[EPOCH SUMMARY] Train Loss: 1.0309

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0028 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0083 -> New: 1.0028)

############################## EPOCH 40/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.338
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029471 0.4970529] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 197/1851 | B: 262/1786 | C: 268/1780
[LOSS Ex1] A: 0.67849 | B: 0.67581 | C: 0.67186
[LOGITS Ex2 A] Mean Abs: 1.335 | Max: 4.911
[LOSS Ex2] A: 0.26524 | B: 0.40441 | C: 0.36721
** [JOINT LOSS] ** : 1.021010
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004468 | Grad Max: 0.112827
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116551 | Grad Max: 0.534937
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.007887
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003120 | Grad Max: 0.003120
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000883 | Grad Max: 0.073040
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015528 | Grad Max: 0.390389
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000186 | Grad Max: 0.004435
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006876 | Grad Max: 0.025507
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000392
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001590 | Grad Max: 0.004286
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000204
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000524 | Grad Max: 0.001581
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001285 | Grad Max: 0.003257
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016926 | Grad Max: 0.016926
[GRADIENT NORM TOTAL] 2.1795

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.158
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.557271   0.44272906] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 215/1833 | B: 237/1619 | C: 251/1797
[LOSS Ex1] A: 0.67704 | B: 0.67810 | C: 0.67541
[LOGITS Ex2 A] Mean Abs: 1.348 | Max: 5.340
[LOSS Ex2] A: 0.27641 | B: 0.39672 | C: 0.36477
** [JOINT LOSS] ** : 1.022821
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001843 | Grad Max: 0.032823
  -> Layer: shared_layers.0.bias | Grad Mean: 0.037665 | Grad Max: 0.220912
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.009018
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011797 | Grad Max: 0.011797
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000318 | Grad Max: 0.042752
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005202 | Grad Max: 0.223902
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002454
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001616 | Grad Max: 0.010153
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000260
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000329 | Grad Max: 0.001816
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000104 | Grad Max: 0.000509
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000495 | Grad Max: 0.001693
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003000 | Grad Max: 0.003000
[GRADIENT NORM TOTAL] 0.7658

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.211
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5343359  0.46566406] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.019
[MASKS] A(Pass/Fail): 175/1441 | B: 257/1791 | C: 262/1786
[LOSS Ex1] A: 0.67613 | B: 0.67766 | C: 0.67325
[LOGITS Ex2 A] Mean Abs: 1.409 | Max: 6.849
[LOSS Ex2] A: 0.26367 | B: 0.43860 | C: 0.39308
** [JOINT LOSS] ** : 1.040795
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003724 | Grad Max: 0.107392
  -> Layer: shared_layers.0.bias | Grad Mean: 0.130072 | Grad Max: 0.653533
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001960 | Grad Max: 0.008547
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000017 | Grad Max: 0.000017
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001021 | Grad Max: 0.105917
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019000 | Grad Max: 0.604004
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000211 | Grad Max: 0.005802
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007981 | Grad Max: 0.031555
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000432
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001775 | Grad Max: 0.004620
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000220
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000573 | Grad Max: 0.001749
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000988 | Grad Max: 0.002204
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016040 | Grad Max: 0.016040
[GRADIENT NORM TOTAL] 2.5779

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.340
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078193 0.4921807] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 235/1813 | B: 283/1765 | C: 267/1781
[LOSS Ex1] A: 0.67637 | B: 0.67806 | C: 0.67133
[LOGITS Ex2 A] Mean Abs: 1.380 | Max: 6.020
[LOSS Ex2] A: 0.26922 | B: 0.41743 | C: 0.37111
** [JOINT LOSS] ** : 1.027839
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002370 | Grad Max: 0.057503
  -> Layer: shared_layers.0.bias | Grad Mean: 0.058253 | Grad Max: 0.247137
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.008995
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008818 | Grad Max: 0.008818
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000455 | Grad Max: 0.049657
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008111 | Grad Max: 0.270929
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003851
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003200 | Grad Max: 0.016477
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000289
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000732 | Grad Max: 0.002326
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000140
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000239 | Grad Max: 0.000986
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001807
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006737 | Grad Max: 0.006737
[GRADIENT NORM TOTAL] 1.1591

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.321
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50257045 0.4974296 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.019
[MASKS] A(Pass/Fail): 226/1822 | B: 281/1767 | C: 225/1823
[LOSS Ex1] A: 0.67562 | B: 0.67564 | C: 0.67537
[LOGITS Ex2 A] Mean Abs: 1.352 | Max: 5.255
[LOSS Ex2] A: 0.26527 | B: 0.41613 | C: 0.36251
** [JOINT LOSS] ** : 1.023507
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004136 | Grad Max: 0.085456
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226329 | Grad Max: 1.118514
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.009572
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015711 | Grad Max: 0.015711
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001675 | Grad Max: 0.196294
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031568 | Grad Max: 1.109839
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.010000
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014036 | Grad Max: 0.055863
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000762
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003111 | Grad Max: 0.007622
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000320
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001013 | Grad Max: 0.002510
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002009 | Grad Max: 0.004628
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030483 | Grad Max: 0.030483
[GRADIENT NORM TOTAL] 4.5355

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.330
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50425625 0.49574378] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.019
[MASKS] A(Pass/Fail): 223/1825 | B: 249/1607 | C: 260/1788
[LOSS Ex1] A: 0.67470 | B: 0.67793 | C: 0.67309
[LOGITS Ex2 A] Mean Abs: 1.334 | Max: 5.474
[LOSS Ex2] A: 0.29247 | B: 0.40468 | C: 0.36444
** [JOINT LOSS] ** : 1.029108
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005170 | Grad Max: 0.112255
  -> Layer: shared_layers.0.bias | Grad Mean: 0.293875 | Grad Max: 1.521630
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.008952
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006720 | Grad Max: 0.006720
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.196565
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039829 | Grad Max: 1.100478
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000465 | Grad Max: 0.011420
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017901 | Grad Max: 0.061010
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000927
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003952 | Grad Max: 0.008857
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000404
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001285 | Grad Max: 0.003252
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002510 | Grad Max: 0.004887
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039128 | Grad Max: 0.039128
[GRADIENT NORM TOTAL] 5.7510

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.282
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070765  0.49292347] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 211/1837 | B: 260/1788 | C: 227/1821
[LOSS Ex1] A: 0.67782 | B: 0.67749 | C: 0.67481
[LOGITS Ex2 A] Mean Abs: 1.345 | Max: 5.187
[LOSS Ex2] A: 0.27434 | B: 0.42952 | C: 0.37377
** [JOINT LOSS] ** : 1.035917
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.064049
  -> Layer: shared_layers.0.bias | Grad Mean: 0.054346 | Grad Max: 0.274009
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001821 | Grad Max: 0.007727
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004482 | Grad Max: 0.004482
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000490 | Grad Max: 0.102765
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008460 | Grad Max: 0.580881
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000085 | Grad Max: 0.003336
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003069 | Grad Max: 0.018995
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000242
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000694 | Grad Max: 0.002398
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000226 | Grad Max: 0.000840
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000548 | Grad Max: 0.001809
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007361 | Grad Max: 0.007361
[GRADIENT NORM TOTAL] 1.2967

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.195
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5158098  0.48419026] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 203/1845 | B: 286/1762 | C: 271/1777
[LOSS Ex1] A: 0.67735 | B: 0.67789 | C: 0.67138
[LOGITS Ex2 A] Mean Abs: 1.364 | Max: 5.437
[LOSS Ex2] A: 0.29598 | B: 0.44064 | C: 0.35815
** [JOINT LOSS] ** : 1.040463
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006642 | Grad Max: 0.169095
  -> Layer: shared_layers.0.bias | Grad Mean: 0.365065 | Grad Max: 1.860129
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001909 | Grad Max: 0.008092
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000394 | Grad Max: 0.000394
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002616 | Grad Max: 0.214155
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049485 | Grad Max: 1.137242
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000600 | Grad Max: 0.013953
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023134 | Grad Max: 0.078732
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001081
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005098 | Grad Max: 0.011175
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000480
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001663 | Grad Max: 0.004183
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003271 | Grad Max: 0.005992
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051198 | Grad Max: 0.051198
[GRADIENT NORM TOTAL] 7.1506

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.308
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5622334 0.4377666] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 236/1812 | B: 283/1765 | C: 242/1806
[LOSS Ex1] A: 0.67573 | B: 0.67546 | C: 0.67400
[LOGITS Ex2 A] Mean Abs: 1.408 | Max: 5.076
[LOSS Ex2] A: 0.27194 | B: 0.44354 | C: 0.37259
** [JOINT LOSS] ** : 1.037750
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008904 | Grad Max: 0.225678
  -> Layer: shared_layers.0.bias | Grad Mean: 0.503962 | Grad Max: 2.529361
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.009759
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016153 | Grad Max: 0.016153
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003649 | Grad Max: 0.282990
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.068824 | Grad Max: 1.584049
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000833 | Grad Max: 0.018867
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032034 | Grad Max: 0.107083
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001597
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007063 | Grad Max: 0.016663
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000059 | Grad Max: 0.000650
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002300 | Grad Max: 0.005533
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004534 | Grad Max: 0.007986
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.070967 | Grad Max: 0.070967
[GRADIENT NORM TOTAL] 9.8683

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.343
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027356 0.4972644] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 216/1832 | B: 251/1605 | C: 250/1798
[LOSS Ex1] A: 0.67820 | B: 0.67778 | C: 0.67398
[LOGITS Ex2 A] Mean Abs: 1.414 | Max: 5.135
[LOSS Ex2] A: 0.26965 | B: 0.40345 | C: 0.35129
** [JOINT LOSS] ** : 1.018116
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005821 | Grad Max: 0.143468
  -> Layer: shared_layers.0.bias | Grad Mean: 0.315927 | Grad Max: 1.666891
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001866 | Grad Max: 0.008445
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009779 | Grad Max: 0.009779
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002326 | Grad Max: 0.161288
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044320 | Grad Max: 0.877876
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000549 | Grad Max: 0.013886
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021215 | Grad Max: 0.079332
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001005
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004655 | Grad Max: 0.009999
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000429
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001527 | Grad Max: 0.003793
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002945 | Grad Max: 0.005965
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047742 | Grad Max: 0.047742
[GRADIENT NORM TOTAL] 6.2451

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.162
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55865943 0.4413405 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 237/1811 | B: 262/1786 | C: 231/1817
[LOSS Ex1] A: 0.67672 | B: 0.67734 | C: 0.67531
[LOGITS Ex2 A] Mean Abs: 1.372 | Max: 5.418
[LOSS Ex2] A: 0.26450 | B: 0.43684 | C: 0.37288
** [JOINT LOSS] ** : 1.034530
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002619 | Grad Max: 0.076471
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140799 | Grad Max: 0.647826
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001896 | Grad Max: 0.008436
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008072 | Grad Max: 0.008072
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001003 | Grad Max: 0.107010
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018560 | Grad Max: 0.595377
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000217 | Grad Max: 0.006498
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008418 | Grad Max: 0.036111
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000444
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001883 | Grad Max: 0.004792
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000615 | Grad Max: 0.001733
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001294 | Grad Max: 0.002683
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019530 | Grad Max: 0.019530
[GRADIENT NORM TOTAL] 2.7186

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.216
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53533083 0.4646692 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 188/1428 | B: 291/1757 | C: 271/1777
[LOSS Ex1] A: 0.67581 | B: 0.67775 | C: 0.67264
[LOGITS Ex2 A] Mean Abs: 1.389 | Max: 6.229
[LOSS Ex2] A: 0.26224 | B: 0.42287 | C: 0.36336
** [JOINT LOSS] ** : 1.024891
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004009 | Grad Max: 0.090402
  -> Layer: shared_layers.0.bias | Grad Mean: 0.224128 | Grad Max: 1.146800
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.008771
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005162 | Grad Max: 0.005162
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001610 | Grad Max: 0.142671
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030206 | Grad Max: 0.804746
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.008072
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013925 | Grad Max: 0.045890
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000685
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003076 | Grad Max: 0.006926
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000287
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001001 | Grad Max: 0.002542
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002038 | Grad Max: 0.003954
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031502 | Grad Max: 0.031502
[GRADIENT NORM TOTAL] 4.4075

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50765514 0.49234492] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 250/1798 | B: 285/1763 | C: 261/1787
[LOSS Ex1] A: 0.67604 | B: 0.67532 | C: 0.67258
[LOGITS Ex2 A] Mean Abs: 1.402 | Max: 6.683
[LOSS Ex2] A: 0.26495 | B: 0.41150 | C: 0.33119
** [JOINT LOSS] ** : 1.010530
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.045802
  -> Layer: shared_layers.0.bias | Grad Mean: 0.121416 | Grad Max: 0.641891
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.008703
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006368 | Grad Max: 0.006368
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000892 | Grad Max: 0.098443
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016458 | Grad Max: 0.547800
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.006060
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007682 | Grad Max: 0.027814
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000464
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001701 | Grad Max: 0.004533
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000537 | Grad Max: 0.001490
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000991 | Grad Max: 0.002781
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015209 | Grad Max: 0.015209
[GRADIENT NORM TOTAL] 2.4760

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.325
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50278413 0.4972159 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 242/1806 | B: 254/1602 | C: 175/1201
[LOSS Ex1] A: 0.67529 | B: 0.67765 | C: 0.67191
[LOGITS Ex2 A] Mean Abs: 1.400 | Max: 6.019
[LOSS Ex2] A: 0.27239 | B: 0.41579 | C: 0.37350
** [JOINT LOSS] ** : 1.028840
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004561 | Grad Max: 0.107642
  -> Layer: shared_layers.0.bias | Grad Mean: 0.228635 | Grad Max: 1.123531
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.009119
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007090 | Grad Max: 0.007090
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001690 | Grad Max: 0.133100
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031421 | Grad Max: 0.734520
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.008223
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014180 | Grad Max: 0.048163
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000717
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003134 | Grad Max: 0.007344
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001017 | Grad Max: 0.002577
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001894 | Grad Max: 0.003989
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031262 | Grad Max: 0.031262
[GRADIENT NORM TOTAL] 4.5339

[EPOCH SUMMARY] Train Loss: 1.0283

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 1.0087 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 41/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.333
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040726  0.49592736] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 239/1809 | B: 265/1783 | C: 240/1808
[LOSS Ex1] A: 0.67438 | B: 0.67720 | C: 0.67354
[LOGITS Ex2 A] Mean Abs: 1.415 | Max: 5.954
[LOSS Ex2] A: 0.28264 | B: 0.44915 | C: 0.35189
** [JOINT LOSS] ** : 1.036264
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007176 | Grad Max: 0.194677
  -> Layer: shared_layers.0.bias | Grad Mean: 0.351506 | Grad Max: 1.723177
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.009838
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014813 | Grad Max: 0.014813
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002554 | Grad Max: 0.205361
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048120 | Grad Max: 1.119263
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000571 | Grad Max: 0.013766
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022048 | Grad Max: 0.075310
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001012
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004868 | Grad Max: 0.010996
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000444
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001578 | Grad Max: 0.003834
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003023 | Grad Max: 0.005494
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048925 | Grad Max: 0.048925
[GRADIENT NORM TOTAL] 6.8108

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.285
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069191 0.4930809] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 222/1826 | B: 292/1756 | C: 262/1786
[LOSS Ex1] A: 0.67757 | B: 0.67762 | C: 0.67236
[LOGITS Ex2 A] Mean Abs: 1.377 | Max: 5.228
[LOSS Ex2] A: 0.26599 | B: 0.42682 | C: 0.34859
** [JOINT LOSS] ** : 1.022979
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003203 | Grad Max: 0.075445
  -> Layer: shared_layers.0.bias | Grad Mean: 0.199351 | Grad Max: 0.956747
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001854 | Grad Max: 0.007668
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002904 | Grad Max: 0.002904
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001398 | Grad Max: 0.121428
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025945 | Grad Max: 0.678230
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.007347
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011794 | Grad Max: 0.040918
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000615
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002597 | Grad Max: 0.006086
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000263
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000849 | Grad Max: 0.002214
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001490 | Grad Max: 0.003936
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025545 | Grad Max: 0.025545
[GRADIENT NORM TOTAL] 3.9280

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.197
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51615    0.48384997] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.518 | Std: 0.019
[MASKS] A(Pass/Fail): 225/1823 | B: 286/1762 | C: 241/1807
[LOSS Ex1] A: 0.67710 | B: 0.67518 | C: 0.67333
[LOGITS Ex2 A] Mean Abs: 1.303 | Max: 5.322
[LOSS Ex2] A: 0.28266 | B: 0.41595 | C: 0.36641
** [JOINT LOSS] ** : 1.030211
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005290 | Grad Max: 0.119424
  -> Layer: shared_layers.0.bias | Grad Mean: 0.283566 | Grad Max: 1.441921
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001942 | Grad Max: 0.008211
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000814 | Grad Max: 0.000814
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.224933
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038596 | Grad Max: 1.272223
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000464 | Grad Max: 0.010622
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017993 | Grad Max: 0.062494
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000851
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003969 | Grad Max: 0.008783
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000370
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001288 | Grad Max: 0.003139
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002448 | Grad Max: 0.004718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038603 | Grad Max: 0.038603
[GRADIENT NORM TOTAL] 5.6415

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.311
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56352425 0.4364758 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.020
[MASKS] A(Pass/Fail): 248/1800 | B: 255/1601 | C: 251/1797
[LOSS Ex1] A: 0.67544 | B: 0.67753 | C: 0.67233
[LOGITS Ex2 A] Mean Abs: 1.363 | Max: 5.709
[LOSS Ex2] A: 0.26039 | B: 0.42267 | C: 0.36363
** [JOINT LOSS] ** : 1.023997
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006233 | Grad Max: 0.135248
  -> Layer: shared_layers.0.bias | Grad Mean: 0.376423 | Grad Max: 1.854193
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.009471
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014678 | Grad Max: 0.014678
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002685 | Grad Max: 0.217696
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050581 | Grad Max: 1.222566
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000612 | Grad Max: 0.014255
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023693 | Grad Max: 0.079564
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001133
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005199 | Grad Max: 0.011867
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000455
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001681 | Grad Max: 0.004032
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003206 | Grad Max: 0.006090
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051063 | Grad Max: 0.051063
[GRADIENT NORM TOTAL] 7.3864

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.347
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026144 0.4973856] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 228/1820 | B: 269/1779 | C: 279/1769
[LOSS Ex1] A: 0.67796 | B: 0.67709 | C: 0.67045
[LOGITS Ex2 A] Mean Abs: 1.375 | Max: 5.039
[LOSS Ex2] A: 0.24896 | B: 0.43449 | C: 0.35177
** [JOINT LOSS] ** : 1.020241
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005210 | Grad Max: 0.131115
  -> Layer: shared_layers.0.bias | Grad Mean: 0.227558 | Grad Max: 1.127227
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.007965
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003479 | Grad Max: 0.003479
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001681 | Grad Max: 0.138360
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031109 | Grad Max: 0.777677
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.009418
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014220 | Grad Max: 0.047524
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000701
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003171 | Grad Max: 0.007605
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000331
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001024 | Grad Max: 0.002757
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002115 | Grad Max: 0.003927
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031647 | Grad Max: 0.031647
[GRADIENT NORM TOTAL] 4.4075

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.164
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5596446 0.4403554] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 245/1803 | B: 295/1753 | C: 242/1806
[LOSS Ex1] A: 0.67648 | B: 0.67751 | C: 0.67471
[LOGITS Ex2 A] Mean Abs: 1.409 | Max: 5.157
[LOSS Ex2] A: 0.27450 | B: 0.43269 | C: 0.36316
** [JOINT LOSS] ** : 1.033017
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003551 | Grad Max: 0.078374
  -> Layer: shared_layers.0.bias | Grad Mean: 0.213775 | Grad Max: 1.092499
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001914 | Grad Max: 0.008703
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010556 | Grad Max: 0.010556
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001519 | Grad Max: 0.126509
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028666 | Grad Max: 0.739060
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.009424
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013043 | Grad Max: 0.051688
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000672
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002854 | Grad Max: 0.007163
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000281
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000918 | Grad Max: 0.002447
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001651 | Grad Max: 0.003530
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027240 | Grad Max: 0.027240
[GRADIENT NORM TOTAL] 4.2295

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.219
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5359657  0.46403435] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.020
[MASKS] A(Pass/Fail): 202/1414 | B: 288/1760 | C: 246/1802
[LOSS Ex1] A: 0.67557 | B: 0.67507 | C: 0.67510
[LOGITS Ex2 A] Mean Abs: 1.449 | Max: 7.200
[LOSS Ex2] A: 0.25691 | B: 0.42673 | C: 0.36736
** [JOINT LOSS] ** : 1.025577
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005055 | Grad Max: 0.118786
  -> Layer: shared_layers.0.bias | Grad Mean: 0.311351 | Grad Max: 1.564967
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.009144
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007635 | Grad Max: 0.007635
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.175879
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041289 | Grad Max: 1.006615
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000496 | Grad Max: 0.012126
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019271 | Grad Max: 0.069555
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000879
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004224 | Grad Max: 0.009123
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000377
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001371 | Grad Max: 0.003353
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002578 | Grad Max: 0.004964
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042173 | Grad Max: 0.042173
[GRADIENT NORM TOTAL] 6.0208

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.348
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075727 0.4924273] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 258/1790 | B: 260/1596 | C: 244/1804
[LOSS Ex1] A: 0.67581 | B: 0.67743 | C: 0.67432
[LOGITS Ex2 A] Mean Abs: 1.415 | Max: 5.726
[LOSS Ex2] A: 0.26779 | B: 0.39479 | C: 0.37218
** [JOINT LOSS] ** : 1.020772
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003095 | Grad Max: 0.066644
  -> Layer: shared_layers.0.bias | Grad Mean: 0.155125 | Grad Max: 0.821169
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.008793
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009917 | Grad Max: 0.009917
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001184 | Grad Max: 0.102034
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022053 | Grad Max: 0.581166
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000249 | Grad Max: 0.006133
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009653 | Grad Max: 0.036841
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000492
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002130 | Grad Max: 0.005171
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000253
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.001903
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001311 | Grad Max: 0.003206
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021530 | Grad Max: 0.021530
[GRADIENT NORM TOTAL] 3.1550

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.328
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028795 0.4971205] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 253/1795 | B: 272/1776 | C: 262/1786
[LOSS Ex1] A: 0.67504 | B: 0.67698 | C: 0.67404
[LOGITS Ex2 A] Mean Abs: 1.373 | Max: 5.088
[LOSS Ex2] A: 0.26964 | B: 0.43809 | C: 0.37342
** [JOINT LOSS] ** : 1.035739
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004007 | Grad Max: 0.101677
  -> Layer: shared_layers.0.bias | Grad Mean: 0.258778 | Grad Max: 1.241038
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.009419
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012977 | Grad Max: 0.012977
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.147480
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033868 | Grad Max: 0.842740
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000405 | Grad Max: 0.010139
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015750 | Grad Max: 0.054388
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000714
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003493 | Grad Max: 0.007852
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000352
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001148 | Grad Max: 0.002890
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002361 | Grad Max: 0.004381
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036629 | Grad Max: 0.036629
[GRADIENT NORM TOTAL] 4.9919

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.336
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503973   0.49602696] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 248/1800 | B: 296/1752 | C: 238/1810
[LOSS Ex1] A: 0.67414 | B: 0.67741 | C: 0.67443
[LOGITS Ex2 A] Mean Abs: 1.353 | Max: 5.710
[LOSS Ex2] A: 0.27601 | B: 0.44080 | C: 0.38342
** [JOINT LOSS] ** : 1.042070
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005498 | Grad Max: 0.141402
  -> Layer: shared_layers.0.bias | Grad Mean: 0.342538 | Grad Max: 1.679522
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.008809
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006434 | Grad Max: 0.006434
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002384 | Grad Max: 0.246387
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045286 | Grad Max: 1.371594
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000543 | Grad Max: 0.013041
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021111 | Grad Max: 0.071785
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000928
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004615 | Grad Max: 0.009924
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000447
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001510 | Grad Max: 0.003825
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003039 | Grad Max: 0.005839
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047868 | Grad Max: 0.047868
[GRADIENT NORM TOTAL] 6.6943

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.287
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50682634 0.49317366] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 230/1818 | B: 290/1758 | C: 274/1774
[LOSS Ex1] A: 0.67738 | B: 0.67496 | C: 0.67080
[LOGITS Ex2 A] Mean Abs: 1.331 | Max: 5.437
[LOSS Ex2] A: 0.26364 | B: 0.40523 | C: 0.38011
** [JOINT LOSS] ** : 1.024040
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003295 | Grad Max: 0.084674
  -> Layer: shared_layers.0.bias | Grad Mean: 0.184452 | Grad Max: 0.804883
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.007202
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003365 | Grad Max: 0.003365
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001295 | Grad Max: 0.120673
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024136 | Grad Max: 0.691168
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.007284
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011300 | Grad Max: 0.040163
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000582
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002494 | Grad Max: 0.005793
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000261
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000802 | Grad Max: 0.002170
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.003693
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024642 | Grad Max: 0.024642
[GRADIENT NORM TOTAL] 3.4953

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.198
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.516367   0.48363304] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 231/1817 | B: 261/1595 | C: 235/1813
[LOSS Ex1] A: 0.67692 | B: 0.67733 | C: 0.67437
[LOGITS Ex2 A] Mean Abs: 1.359 | Max: 5.436
[LOSS Ex2] A: 0.27637 | B: 0.40349 | C: 0.37387
** [JOINT LOSS] ** : 1.027451
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004180 | Grad Max: 0.091607
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226722 | Grad Max: 1.119768
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.007493
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002521 | Grad Max: 0.002521
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001619 | Grad Max: 0.135182
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030432 | Grad Max: 0.759091
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.011050
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013903 | Grad Max: 0.057103
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000688
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003058 | Grad Max: 0.006733
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000306
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000997 | Grad Max: 0.002587
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001917 | Grad Max: 0.004109
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030880 | Grad Max: 0.030880
[GRADIENT NORM TOTAL] 4.4044

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.314
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5644704 0.4355296] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.020
[MASKS] A(Pass/Fail): 256/1792 | B: 273/1775 | C: 282/1766
[LOSS Ex1] A: 0.67523 | B: 0.67688 | C: 0.67085
[LOGITS Ex2 A] Mean Abs: 1.419 | Max: 5.219
[LOSS Ex2] A: 0.26946 | B: 0.43785 | C: 0.35078
** [JOINT LOSS] ** : 1.027016
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008139 | Grad Max: 0.209688
  -> Layer: shared_layers.0.bias | Grad Mean: 0.403954 | Grad Max: 1.976498
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.009293
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013015 | Grad Max: 0.013015
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002923 | Grad Max: 0.222338
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054857 | Grad Max: 1.186575
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000657 | Grad Max: 0.014319
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025431 | Grad Max: 0.086142
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001126
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005623 | Grad Max: 0.012461
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000541
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001837 | Grad Max: 0.004484
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003572 | Grad Max: 0.007025
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057563 | Grad Max: 0.057563
[GRADIENT NORM TOTAL] 7.7500

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.350
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50253385 0.4974661 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 242/1806 | B: 298/1750 | C: 184/1192
[LOSS Ex1] A: 0.67779 | B: 0.67731 | C: 0.66958
[LOGITS Ex2 A] Mean Abs: 1.388 | Max: 5.126
[LOSS Ex2] A: 0.26434 | B: 0.43361 | C: 0.35407
** [JOINT LOSS] ** : 1.025565
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002725 | Grad Max: 0.082938
  -> Layer: shared_layers.0.bias | Grad Mean: 0.222302 | Grad Max: 1.085907
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.007930
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003392 | Grad Max: 0.003392
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001515 | Grad Max: 0.133731
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028767 | Grad Max: 0.752768
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.008620
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013794 | Grad Max: 0.050141
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000736
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002992 | Grad Max: 0.007082
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000287
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000971 | Grad Max: 0.002529
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001672 | Grad Max: 0.003733
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029033 | Grad Max: 0.029033
[GRADIENT NORM TOTAL] 4.3617

[EPOCH SUMMARY] Train Loss: 1.0282

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9974 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 1.0028 -> New: 0.9974)

############################## EPOCH 42/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.166
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.560389   0.43961108] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 252/1796 | B: 291/1757 | C: 247/1801
[LOSS Ex1] A: 0.67629 | B: 0.67486 | C: 0.67452
[LOGITS Ex2 A] Mean Abs: 1.359 | Max: 5.559
[LOSS Ex2] A: 0.26835 | B: 0.40203 | C: 0.34636
** [JOINT LOSS] ** : 1.014135
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003245 | Grad Max: 0.071985
  -> Layer: shared_layers.0.bias | Grad Mean: 0.192243 | Grad Max: 0.940855
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.009127
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013784 | Grad Max: 0.013784
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001357 | Grad Max: 0.168812
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025489 | Grad Max: 0.956262
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000292 | Grad Max: 0.007638
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011276 | Grad Max: 0.039603
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000571
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002489 | Grad Max: 0.005665
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000803 | Grad Max: 0.002068
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001619 | Grad Max: 0.003386
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024798 | Grad Max: 0.024798
[GRADIENT NORM TOTAL] 3.8409

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.222
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5364668  0.46353316] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.020
[MASKS] A(Pass/Fail): 208/1408 | B: 263/1593 | C: 249/1799
[LOSS Ex1] A: 0.67537 | B: 0.67724 | C: 0.67313
[LOGITS Ex2 A] Mean Abs: 1.383 | Max: 5.274
[LOSS Ex2] A: 0.24829 | B: 0.41124 | C: 0.35268
** [JOINT LOSS] ** : 1.012651
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004859 | Grad Max: 0.107941
  -> Layer: shared_layers.0.bias | Grad Mean: 0.294331 | Grad Max: 1.479696
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.009052
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007801 | Grad Max: 0.007801
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.207784
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039518 | Grad Max: 1.156150
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000469 | Grad Max: 0.012063
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018260 | Grad Max: 0.065612
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000897
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004021 | Grad Max: 0.009438
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000401
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001299 | Grad Max: 0.003338
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002494 | Grad Max: 0.005281
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039568 | Grad Max: 0.039568
[GRADIENT NORM TOTAL] 5.8424

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.351
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50751626 0.4924838 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 265/1783 | B: 276/1772 | C: 247/1801
[LOSS Ex1] A: 0.67562 | B: 0.67679 | C: 0.67298
[LOGITS Ex2 A] Mean Abs: 1.394 | Max: 6.754
[LOSS Ex2] A: 0.26664 | B: 0.43684 | C: 0.34925
** [JOINT LOSS] ** : 1.026037
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003677 | Grad Max: 0.061192
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143387 | Grad Max: 0.702649
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.008997
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014131 | Grad Max: 0.014131
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001100 | Grad Max: 0.111442
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020079 | Grad Max: 0.612067
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.008326
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008948 | Grad Max: 0.039915
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000504
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001999 | Grad Max: 0.004856
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000643 | Grad Max: 0.001726
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001290 | Grad Max: 0.002896
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019358 | Grad Max: 0.019358
[GRADIENT NORM TOTAL] 2.8340

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.330
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029721  0.49702787] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 266/1782 | B: 301/1747 | C: 262/1786
[LOSS Ex1] A: 0.67484 | B: 0.67722 | C: 0.67368
[LOGITS Ex2 A] Mean Abs: 1.422 | Max: 5.062
[LOSS Ex2] A: 0.26713 | B: 0.43228 | C: 0.38264
** [JOINT LOSS] ** : 1.035930
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006201 | Grad Max: 0.156582
  -> Layer: shared_layers.0.bias | Grad Mean: 0.276992 | Grad Max: 1.354113
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.009683
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016964 | Grad Max: 0.016964
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002005 | Grad Max: 0.171788
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037678 | Grad Max: 0.918789
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.010751
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017384 | Grad Max: 0.061440
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000851
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003842 | Grad Max: 0.009125
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000393
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001255 | Grad Max: 0.003160
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002456 | Grad Max: 0.004414
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038677 | Grad Max: 0.038677
[GRADIENT NORM TOTAL] 5.3559

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.339
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038848 0.4961152] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 257/1791 | B: 293/1755 | C: 289/1759
[LOSS Ex1] A: 0.67394 | B: 0.67476 | C: 0.67002
[LOGITS Ex2 A] Mean Abs: 1.404 | Max: 5.286
[LOSS Ex2] A: 0.28781 | B: 0.41574 | C: 0.35214
** [JOINT LOSS] ** : 1.024804
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006875 | Grad Max: 0.192136
  -> Layer: shared_layers.0.bias | Grad Mean: 0.356231 | Grad Max: 1.660360
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.008930
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004776 | Grad Max: 0.004776
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002577 | Grad Max: 0.210289
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048120 | Grad Max: 1.153459
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.012643
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022213 | Grad Max: 0.072620
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001249
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004929 | Grad Max: 0.011534
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000444
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001607 | Grad Max: 0.003893
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003028 | Grad Max: 0.005811
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049944 | Grad Max: 0.049944
[GRADIENT NORM TOTAL] 6.9627

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.289
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067477  0.49325228] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 236/1812 | B: 268/1588 | C: 219/1829
[LOSS Ex1] A: 0.67722 | B: 0.67715 | C: 0.67564
[LOGITS Ex2 A] Mean Abs: 1.375 | Max: 5.665
[LOSS Ex2] A: 0.25932 | B: 0.39650 | C: 0.38786
** [JOINT LOSS] ** : 1.024563
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002890 | Grad Max: 0.063729
  -> Layer: shared_layers.0.bias | Grad Mean: 0.157313 | Grad Max: 0.758484
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001735 | Grad Max: 0.007558
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002315 | Grad Max: 0.002315
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001123 | Grad Max: 0.110326
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021414 | Grad Max: 0.642948
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.006364
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010009 | Grad Max: 0.036666
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000522
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002182 | Grad Max: 0.005347
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000712 | Grad Max: 0.001942
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001233 | Grad Max: 0.003366
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021484 | Grad Max: 0.021484
[GRADIENT NORM TOTAL] 3.1693

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.200
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51657516 0.4834248 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 239/1809 | B: 277/1771 | C: 274/1774
[LOSS Ex1] A: 0.67675 | B: 0.67670 | C: 0.66950
[LOGITS Ex2 A] Mean Abs: 1.307 | Max: 5.596
[LOSS Ex2] A: 0.26268 | B: 0.43257 | C: 0.36114
** [JOINT LOSS] ** : 1.026446
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005865 | Grad Max: 0.147466
  -> Layer: shared_layers.0.bias | Grad Mean: 0.293617 | Grad Max: 1.492421
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.007506
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009379 | Grad Max: 0.009379
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.140582
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040154 | Grad Max: 0.782750
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000487 | Grad Max: 0.010976
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019006 | Grad Max: 0.064234
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000934
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004210 | Grad Max: 0.010082
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000374
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001362 | Grad Max: 0.003362
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002655 | Grad Max: 0.005012
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041677 | Grad Max: 0.041677
[GRADIENT NORM TOTAL] 5.6636

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.317
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5653543  0.43464568] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 265/1783 | B: 301/1747 | C: 258/1790
[LOSS Ex1] A: 0.67503 | B: 0.67713 | C: 0.67364
[LOGITS Ex2 A] Mean Abs: 1.346 | Max: 5.458
[LOSS Ex2] A: 0.24912 | B: 0.43067 | C: 0.36525
** [JOINT LOSS] ** : 1.023616
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006408 | Grad Max: 0.135141
  -> Layer: shared_layers.0.bias | Grad Mean: 0.371652 | Grad Max: 1.791725
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.009906
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.020798 | Grad Max: 0.020798
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002581 | Grad Max: 0.229560
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049112 | Grad Max: 1.299398
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000587 | Grad Max: 0.016037
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022885 | Grad Max: 0.086307
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001109
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005048 | Grad Max: 0.011963
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000476
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001630 | Grad Max: 0.003976
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002999 | Grad Max: 0.005468
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048582 | Grad Max: 0.048582
[GRADIENT NORM TOTAL] 7.0928

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.353
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024494  0.49755055] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 249/1799 | B: 295/1753 | C: 242/1806
[LOSS Ex1] A: 0.67762 | B: 0.67466 | C: 0.67320
[LOGITS Ex2 A] Mean Abs: 1.370 | Max: 5.132
[LOSS Ex2] A: 0.25147 | B: 0.40781 | C: 0.35806
** [JOINT LOSS] ** : 1.014278
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004023 | Grad Max: 0.116079
  -> Layer: shared_layers.0.bias | Grad Mean: 0.175196 | Grad Max: 0.844326
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.007840
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003986 | Grad Max: 0.003986
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001291 | Grad Max: 0.074996
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024176 | Grad Max: 0.422759
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.007762
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011530 | Grad Max: 0.040509
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000562
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002545 | Grad Max: 0.005893
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000222
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000817 | Grad Max: 0.002072
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001625 | Grad Max: 0.003535
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024985 | Grad Max: 0.024985
[GRADIENT NORM TOTAL] 3.3189

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.168
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56112516 0.43887484] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 259/1789 | B: 269/1587 | C: 239/1809
[LOSS Ex1] A: 0.67612 | B: 0.67707 | C: 0.67313
[LOGITS Ex2 A] Mean Abs: 1.406 | Max: 5.503
[LOSS Ex2] A: 0.27210 | B: 0.41012 | C: 0.34904
** [JOINT LOSS] ** : 1.019190
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005228 | Grad Max: 0.140737
  -> Layer: shared_layers.0.bias | Grad Mean: 0.251122 | Grad Max: 1.263717
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.008027
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001729 | Grad Max: 0.001729
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001866 | Grad Max: 0.149937
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034593 | Grad Max: 0.815124
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.009953
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015375 | Grad Max: 0.052636
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000781
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003417 | Grad Max: 0.008500
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000356
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001122 | Grad Max: 0.002847
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002111 | Grad Max: 0.004694
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034829 | Grad Max: 0.034829
[GRADIENT NORM TOTAL] 4.9167

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.224
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5369349  0.46306515] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 216/1400 | B: 278/1770 | C: 255/1793
[LOSS Ex1] A: 0.67520 | B: 0.67660 | C: 0.67273
[LOGITS Ex2 A] Mean Abs: 1.457 | Max: 4.766
[LOSS Ex2] A: 0.25749 | B: 0.43674 | C: 0.36848
** [JOINT LOSS] ** : 1.029082
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006052 | Grad Max: 0.134945
  -> Layer: shared_layers.0.bias | Grad Mean: 0.367893 | Grad Max: 1.830024
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001964 | Grad Max: 0.008417
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000336 | Grad Max: 0.000336
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002559 | Grad Max: 0.192549
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048747 | Grad Max: 1.058008
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.015514
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022417 | Grad Max: 0.087196
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.001115
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004920 | Grad Max: 0.012312
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000441
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001587 | Grad Max: 0.003953
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002823 | Grad Max: 0.005244
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047454 | Grad Max: 0.047454
[GRADIENT NORM TOTAL] 7.0894

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.354
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074354  0.49256462] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 276/1772 | B: 302/1746 | C: 294/1754
[LOSS Ex1] A: 0.67544 | B: 0.67704 | C: 0.66968
[LOGITS Ex2 A] Mean Abs: 1.418 | Max: 6.516
[LOSS Ex2] A: 0.27007 | B: 0.41802 | C: 0.34882
** [JOINT LOSS] ** : 1.019690
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005046 | Grad Max: 0.125624
  -> Layer: shared_layers.0.bias | Grad Mean: 0.249864 | Grad Max: 1.194422
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.008819
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007361 | Grad Max: 0.007361
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.154900
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034036 | Grad Max: 0.865000
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.010592
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015525 | Grad Max: 0.056564
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000768
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003443 | Grad Max: 0.007925
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001125 | Grad Max: 0.002770
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002089 | Grad Max: 0.004718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034591 | Grad Max: 0.034591
[GRADIENT NORM TOTAL] 4.8767

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.333
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030571  0.49694294] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 271/1777 | B: 296/1752 | C: 257/1791
[LOSS Ex1] A: 0.67465 | B: 0.67457 | C: 0.67204
[LOGITS Ex2 A] Mean Abs: 1.363 | Max: 5.103
[LOSS Ex2] A: 0.24657 | B: 0.40765 | C: 0.36951
** [JOINT LOSS] ** : 1.014997
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002818 | Grad Max: 0.069635
  -> Layer: shared_layers.0.bias | Grad Mean: 0.157785 | Grad Max: 0.813307
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.009153
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007398 | Grad Max: 0.007398
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001142 | Grad Max: 0.124177
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021317 | Grad Max: 0.703826
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.006136
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009718 | Grad Max: 0.034365
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000493
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002137 | Grad Max: 0.005599
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000217
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000684 | Grad Max: 0.001861
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.003185
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019984 | Grad Max: 0.019984
[GRADIENT NORM TOTAL] 3.2439

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.342
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503814   0.49618596] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 264/1784 | B: 270/1586 | C: 161/1215
[LOSS Ex1] A: 0.67375 | B: 0.67698 | C: 0.67302
[LOGITS Ex2 A] Mean Abs: 1.354 | Max: 5.725
[LOSS Ex2] A: 0.27833 | B: 0.39712 | C: 0.38267
** [JOINT LOSS] ** : 1.027295
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003089 | Grad Max: 0.086863
  -> Layer: shared_layers.0.bias | Grad Mean: 0.244588 | Grad Max: 1.171963
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.009309
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008512 | Grad Max: 0.008512
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001605 | Grad Max: 0.145007
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030224 | Grad Max: 0.822676
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000362 | Grad Max: 0.009078
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014290 | Grad Max: 0.050467
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000698
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003133 | Grad Max: 0.007120
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000336
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001006 | Grad Max: 0.002589
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001757 | Grad Max: 0.003343
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029649 | Grad Max: 0.029649
[GRADIENT NORM TOTAL] 4.7026

[EPOCH SUMMARY] Train Loss: 1.0223

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9924 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9974 -> New: 0.9924)

############################## EPOCH 43/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066921  0.49330786] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 243/1805 | B: 279/1769 | C: 269/1779
[LOSS Ex1] A: 0.67706 | B: 0.67652 | C: 0.67198
[LOGITS Ex2 A] Mean Abs: 1.354 | Max: 5.523
[LOSS Ex2] A: 0.25059 | B: 0.42918 | C: 0.36959
** [JOINT LOSS] ** : 1.024970
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002951 | Grad Max: 0.079720
  -> Layer: shared_layers.0.bias | Grad Mean: 0.122075 | Grad Max: 0.578684
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.006836
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003978 | Grad Max: 0.003978
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000967 | Grad Max: 0.116589
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017709 | Grad Max: 0.660132
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.005529
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008220 | Grad Max: 0.031608
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000548
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001823 | Grad Max: 0.004733
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000212
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000594 | Grad Max: 0.001613
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001302 | Grad Max: 0.003032
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018783 | Grad Max: 0.018783
[GRADIENT NORM TOTAL] 2.5501

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.201
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51673365 0.48326635] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 247/1801 | B: 303/1745 | C: 260/1788
[LOSS Ex1] A: 0.67660 | B: 0.67695 | C: 0.67149
[LOGITS Ex2 A] Mean Abs: 1.368 | Max: 5.201
[LOSS Ex2] A: 0.28344 | B: 0.43096 | C: 0.37909
** [JOINT LOSS] ** : 1.039511
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005472 | Grad Max: 0.160780
  -> Layer: shared_layers.0.bias | Grad Mean: 0.309657 | Grad Max: 1.508895
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001889 | Grad Max: 0.007485
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004584 | Grad Max: 0.004584
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.165365
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039824 | Grad Max: 0.924650
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000475 | Grad Max: 0.013028
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018620 | Grad Max: 0.070850
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000849
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004089 | Grad Max: 0.009083
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000367
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001319 | Grad Max: 0.003312
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002388 | Grad Max: 0.004630
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039238 | Grad Max: 0.039238
[GRADIENT NORM TOTAL] 5.9282

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.319
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5661755  0.43382445] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 271/1777 | B: 297/1751 | C: 241/1807
[LOSS Ex1] A: 0.67485 | B: 0.67448 | C: 0.67279
[LOGITS Ex2 A] Mean Abs: 1.410 | Max: 5.422
[LOSS Ex2] A: 0.27022 | B: 0.42949 | C: 0.38335
** [JOINT LOSS] ** : 1.035055
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007833 | Grad Max: 0.194120
  -> Layer: shared_layers.0.bias | Grad Mean: 0.430691 | Grad Max: 2.150725
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.008956
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009239 | Grad Max: 0.009239
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003110 | Grad Max: 0.253180
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058288 | Grad Max: 1.407096
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000687 | Grad Max: 0.017135
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026791 | Grad Max: 0.097624
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001235
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005929 | Grad Max: 0.013360
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000519
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001925 | Grad Max: 0.004514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003682 | Grad Max: 0.006738
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059106 | Grad Max: 0.059106
[GRADIENT NORM TOTAL] 8.4564

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.356
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024189  0.49758106] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 256/1792 | B: 270/1586 | C: 245/1803
[LOSS Ex1] A: 0.67746 | B: 0.67690 | C: 0.67210
[LOGITS Ex2 A] Mean Abs: 1.400 | Max: 4.967
[LOSS Ex2] A: 0.26817 | B: 0.39925 | C: 0.34570
** [JOINT LOSS] ** : 1.013193
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003680 | Grad Max: 0.112252
  -> Layer: shared_layers.0.bias | Grad Mean: 0.255038 | Grad Max: 1.315869
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.008546
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010209 | Grad Max: 0.010209
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001800 | Grad Max: 0.167749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034462 | Grad Max: 0.935517
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000422 | Grad Max: 0.011430
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016768 | Grad Max: 0.059249
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000769
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003674 | Grad Max: 0.008054
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000341
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001205 | Grad Max: 0.003150
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002236 | Grad Max: 0.004949
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037738 | Grad Max: 0.037738
[GRADIENT NORM TOTAL] 5.0867

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.170
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5617827 0.4382173] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 265/1783 | B: 282/1766 | C: 262/1786
[LOSS Ex1] A: 0.67594 | B: 0.67643 | C: 0.67142
[LOGITS Ex2 A] Mean Abs: 1.360 | Max: 5.122
[LOSS Ex2] A: 0.26887 | B: 0.43300 | C: 0.35661
** [JOINT LOSS] ** : 1.027426
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003716 | Grad Max: 0.098585
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153273 | Grad Max: 0.712122
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.008522
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006845 | Grad Max: 0.006845
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001123 | Grad Max: 0.164609
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020762 | Grad Max: 0.924113
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.006604
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008864 | Grad Max: 0.036241
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000567
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001978 | Grad Max: 0.005204
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000207
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000634 | Grad Max: 0.001756
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001202 | Grad Max: 0.002530
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018372 | Grad Max: 0.018372
[GRADIENT NORM TOTAL] 3.1231

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.226
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53736824 0.46263176] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 221/1395 | B: 304/1744 | C: 274/1774
[LOSS Ex1] A: 0.67501 | B: 0.67687 | C: 0.67060
[LOGITS Ex2 A] Mean Abs: 1.369 | Max: 5.277
[LOSS Ex2] A: 0.25428 | B: 0.43338 | C: 0.36876
** [JOINT LOSS] ** : 1.026299
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006058 | Grad Max: 0.133969
  -> Layer: shared_layers.0.bias | Grad Mean: 0.328102 | Grad Max: 1.614570
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.007982
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001877 | Grad Max: 0.001877
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.250919
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044153 | Grad Max: 1.395669
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000523 | Grad Max: 0.013465
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020381 | Grad Max: 0.071994
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.001074
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004517 | Grad Max: 0.011435
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000403
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001469 | Grad Max: 0.003584
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002796 | Grad Max: 0.005066
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045254 | Grad Max: 0.045254
[GRADIENT NORM TOTAL] 6.4338

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.357
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074517 0.4925483] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 282/1766 | B: 299/1749 | C: 254/1794
[LOSS Ex1] A: 0.67525 | B: 0.67439 | C: 0.67315
[LOGITS Ex2 A] Mean Abs: 1.387 | Max: 5.610
[LOSS Ex2] A: 0.25586 | B: 0.41129 | C: 0.35814
** [JOINT LOSS] ** : 1.016028
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003504 | Grad Max: 0.076814
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140381 | Grad Max: 0.701833
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.009409
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015277 | Grad Max: 0.015277
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001025 | Grad Max: 0.069544
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019118 | Grad Max: 0.382909
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.005851
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008925 | Grad Max: 0.032916
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000457
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001983 | Grad Max: 0.004860
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000230
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001844
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001276 | Grad Max: 0.002840
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019472 | Grad Max: 0.019472
[GRADIENT NORM TOTAL] 2.6492

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.335
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030843 0.4969157] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 275/1773 | B: 272/1584 | C: 256/1792
[LOSS Ex1] A: 0.67445 | B: 0.67682 | C: 0.67160
[LOGITS Ex2 A] Mean Abs: 1.396 | Max: 5.106
[LOSS Ex2] A: 0.26141 | B: 0.39613 | C: 0.35134
** [JOINT LOSS] ** : 1.010583
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005455 | Grad Max: 0.125990
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268974 | Grad Max: 1.327690
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.009384
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011663 | Grad Max: 0.011663
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001938 | Grad Max: 0.170389
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036571 | Grad Max: 0.938565
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000432 | Grad Max: 0.010165
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016936 | Grad Max: 0.057904
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000847
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003738 | Grad Max: 0.009464
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000355
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001214 | Grad Max: 0.003026
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002163 | Grad Max: 0.005048
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036970 | Grad Max: 0.036970
[GRADIENT NORM TOTAL] 5.2568

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038058  0.49619418] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 274/1774 | B: 283/1765 | C: 252/1796
[LOSS Ex1] A: 0.67356 | B: 0.67635 | C: 0.67253
[LOGITS Ex2 A] Mean Abs: 1.426 | Max: 5.995
[LOSS Ex2] A: 0.28927 | B: 0.43907 | C: 0.37773
** [JOINT LOSS] ** : 1.042833
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008052 | Grad Max: 0.259024
  -> Layer: shared_layers.0.bias | Grad Mean: 0.340459 | Grad Max: 1.586569
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.009560
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009880 | Grad Max: 0.009880
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002545 | Grad Max: 0.200083
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047109 | Grad Max: 1.064484
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.012004
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021288 | Grad Max: 0.068403
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.001003
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004763 | Grad Max: 0.010079
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000442
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001540 | Grad Max: 0.003706
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002912 | Grad Max: 0.005224
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047192 | Grad Max: 0.047192
[GRADIENT NORM TOTAL] 6.5604

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.293
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066951  0.49330494] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 250/1798 | B: 306/1742 | C: 254/1794
[LOSS Ex1] A: 0.67689 | B: 0.67678 | C: 0.67043
[LOGITS Ex2 A] Mean Abs: 1.368 | Max: 5.048
[LOSS Ex2] A: 0.25541 | B: 0.41976 | C: 0.35367
** [JOINT LOSS] ** : 1.017642
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004365 | Grad Max: 0.086541
  -> Layer: shared_layers.0.bias | Grad Mean: 0.228516 | Grad Max: 1.125059
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001902 | Grad Max: 0.007729
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003344 | Grad Max: 0.003344
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001626 | Grad Max: 0.137325
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030674 | Grad Max: 0.772786
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.009477
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014144 | Grad Max: 0.053398
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000636
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003117 | Grad Max: 0.006771
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000297
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001022 | Grad Max: 0.002494
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001773 | Grad Max: 0.004331
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030790 | Grad Max: 0.030790
[GRADIENT NORM TOTAL] 4.4894

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.202
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51688373 0.48311627] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.020
[MASKS] A(Pass/Fail): 257/1791 | B: 300/1748 | C: 258/1790
[LOSS Ex1] A: 0.67642 | B: 0.67430 | C: 0.67200
[LOGITS Ex2 A] Mean Abs: 1.316 | Max: 5.458
[LOSS Ex2] A: 0.27126 | B: 0.41224 | C: 0.35883
** [JOINT LOSS] ** : 1.021685
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003560 | Grad Max: 0.116675
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269759 | Grad Max: 1.426335
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001938 | Grad Max: 0.007976
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001862 | Grad Max: 0.001862
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001872 | Grad Max: 0.144116
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035414 | Grad Max: 0.813111
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.010910
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017002 | Grad Max: 0.062045
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000851
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003730 | Grad Max: 0.008863
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000315
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001200 | Grad Max: 0.002943
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002140 | Grad Max: 0.004118
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035746 | Grad Max: 0.035746
[GRADIENT NORM TOTAL] 5.3139

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.322
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5669801 0.4330199] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.021
[MASKS] A(Pass/Fail): 274/1774 | B: 273/1583 | C: 235/1813
[LOSS Ex1] A: 0.67465 | B: 0.67673 | C: 0.67526
[LOGITS Ex2 A] Mean Abs: 1.338 | Max: 5.417
[LOSS Ex2] A: 0.25177 | B: 0.41126 | C: 0.34666
** [JOINT LOSS] ** : 1.012110
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005366 | Grad Max: 0.120533
  -> Layer: shared_layers.0.bias | Grad Mean: 0.353358 | Grad Max: 1.737320
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002067 | Grad Max: 0.009447
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017989 | Grad Max: 0.017989
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.201455
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045920 | Grad Max: 1.135480
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000555 | Grad Max: 0.013514
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021909 | Grad Max: 0.077363
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001021
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004805 | Grad Max: 0.011059
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000446
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001533 | Grad Max: 0.003930
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002632 | Grad Max: 0.005222
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044081 | Grad Max: 0.044081
[GRADIENT NORM TOTAL] 6.7998

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.359
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023944  0.49760556] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 269/1779 | B: 284/1764 | C: 274/1774
[LOSS Ex1] A: 0.67730 | B: 0.67626 | C: 0.67316
[LOGITS Ex2 A] Mean Abs: 1.368 | Max: 5.265
[LOSS Ex2] A: 0.24454 | B: 0.43325 | C: 0.33926
** [JOINT LOSS] ** : 1.014588
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003844 | Grad Max: 0.098388
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196788 | Grad Max: 1.007794
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.008163
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012444 | Grad Max: 0.012444
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001412 | Grad Max: 0.141250
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026194 | Grad Max: 0.776575
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000301 | Grad Max: 0.008764
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011820 | Grad Max: 0.046803
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000602
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002637 | Grad Max: 0.006604
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000242
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000851 | Grad Max: 0.002167
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001681 | Grad Max: 0.003663
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026126 | Grad Max: 0.026126
[GRADIENT NORM TOTAL] 3.9258

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.172
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56246275 0.43753725] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 272/1776 | B: 306/1742 | C: 167/1209
[LOSS Ex1] A: 0.67576 | B: 0.67670 | C: 0.67450
[LOGITS Ex2 A] Mean Abs: 1.388 | Max: 5.251
[LOSS Ex2] A: 0.27255 | B: 0.41973 | C: 0.34524
** [JOINT LOSS] ** : 1.021495
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002520 | Grad Max: 0.070758
  -> Layer: shared_layers.0.bias | Grad Mean: 0.194652 | Grad Max: 0.945777
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.008367
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007076 | Grad Max: 0.007076
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001333 | Grad Max: 0.127047
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024845 | Grad Max: 0.721132
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.007369
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011429 | Grad Max: 0.042497
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000582
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002512 | Grad Max: 0.006295
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000253
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000822 | Grad Max: 0.002051
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001447 | Grad Max: 0.003715
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024500 | Grad Max: 0.024500
[GRADIENT NORM TOTAL] 3.8784

[EPOCH SUMMARY] Train Loss: 1.0231

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9993 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 44/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.229
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5378378  0.46216223] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 224/1392 | B: 303/1745 | C: 253/1795
[LOSS Ex1] A: 0.67483 | B: 0.67421 | C: 0.67385
[LOGITS Ex2 A] Mean Abs: 1.453 | Max: 5.693
[LOSS Ex2] A: 0.25450 | B: 0.41173 | C: 0.35859
** [JOINT LOSS] ** : 1.015903
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005083 | Grad Max: 0.116040
  -> Layer: shared_layers.0.bias | Grad Mean: 0.312213 | Grad Max: 1.579054
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.009108
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007810 | Grad Max: 0.007810
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.159567
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041079 | Grad Max: 0.864758
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000483 | Grad Max: 0.011965
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019050 | Grad Max: 0.068729
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000886
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004206 | Grad Max: 0.009339
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000428
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001370 | Grad Max: 0.003505
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002485 | Grad Max: 0.005232
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042010 | Grad Max: 0.042010
[GRADIENT NORM TOTAL] 6.0523

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.360
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073994  0.49260062] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 288/1760 | B: 273/1583 | C: 262/1786
[LOSS Ex1] A: 0.67507 | B: 0.67665 | C: 0.67245
[LOGITS Ex2 A] Mean Abs: 1.416 | Max: 6.254
[LOSS Ex2] A: 0.25181 | B: 0.39213 | C: 0.35256
** [JOINT LOSS] ** : 1.006893
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002941 | Grad Max: 0.058209
  -> Layer: shared_layers.0.bias | Grad Mean: 0.138921 | Grad Max: 0.722374
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.009176
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012438 | Grad Max: 0.012438
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001014 | Grad Max: 0.108670
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018897 | Grad Max: 0.614398
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000219 | Grad Max: 0.005743
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008579 | Grad Max: 0.031882
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000419
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001902 | Grad Max: 0.004603
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000204
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000629 | Grad Max: 0.001703
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001148 | Grad Max: 0.003563
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019984 | Grad Max: 0.019984
[GRADIENT NORM TOTAL] 2.7780

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.338
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50319266 0.4968073 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 283/1765 | B: 284/1764 | C: 266/1782
[LOSS Ex1] A: 0.67426 | B: 0.67617 | C: 0.67092
[LOGITS Ex2 A] Mean Abs: 1.371 | Max: 5.308
[LOSS Ex2] A: 0.25009 | B: 0.42642 | C: 0.36296
** [JOINT LOSS] ** : 1.020274
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005246 | Grad Max: 0.108249
  -> Layer: shared_layers.0.bias | Grad Mean: 0.292851 | Grad Max: 1.400569
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.009161
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010366 | Grad Max: 0.010366
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002037 | Grad Max: 0.192296
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038383 | Grad Max: 1.103268
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.011665
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017485 | Grad Max: 0.063907
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000818
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003860 | Grad Max: 0.008552
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000359
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001248 | Grad Max: 0.003020
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002414 | Grad Max: 0.004422
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038175 | Grad Max: 0.038175
[GRADIENT NORM TOTAL] 5.6127

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.347
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50373065 0.4962694 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 283/1765 | B: 306/1742 | C: 249/1799
[LOSS Ex1] A: 0.67336 | B: 0.67661 | C: 0.67342
[LOGITS Ex2 A] Mean Abs: 1.350 | Max: 6.103
[LOSS Ex2] A: 0.27520 | B: 0.43690 | C: 0.36550
** [JOINT LOSS] ** : 1.033664
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005557 | Grad Max: 0.152910
  -> Layer: shared_layers.0.bias | Grad Mean: 0.391826 | Grad Max: 1.957836
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.008917
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010393 | Grad Max: 0.010393
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002662 | Grad Max: 0.218444
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050725 | Grad Max: 1.259843
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000601 | Grad Max: 0.015108
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023802 | Grad Max: 0.087808
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001109
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005216 | Grad Max: 0.012148
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000452
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001679 | Grad Max: 0.004009
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003002 | Grad Max: 0.005566
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049889 | Grad Max: 0.049889
[GRADIENT NORM TOTAL] 7.5802

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.295
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066146  0.49338534] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 258/1790 | B: 303/1745 | C: 269/1779
[LOSS Ex1] A: 0.67674 | B: 0.67412 | C: 0.66988
[LOGITS Ex2 A] Mean Abs: 1.348 | Max: 5.113
[LOSS Ex2] A: 0.25316 | B: 0.41712 | C: 0.36839
** [JOINT LOSS] ** : 1.019799
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003726 | Grad Max: 0.096601
  -> Layer: shared_layers.0.bias | Grad Mean: 0.180531 | Grad Max: 0.867577
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.007368
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001185 | Grad Max: 0.001185
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001301 | Grad Max: 0.120370
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024289 | Grad Max: 0.676916
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000286 | Grad Max: 0.009067
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011207 | Grad Max: 0.045206
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000577
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002496 | Grad Max: 0.005781
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000271
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000802 | Grad Max: 0.002161
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001578 | Grad Max: 0.003309
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023977 | Grad Max: 0.023977
[GRADIENT NORM TOTAL] 3.4991

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.204
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5171022  0.48289782] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 265/1783 | B: 273/1583 | C: 263/1785
[LOSS Ex1] A: 0.67628 | B: 0.67656 | C: 0.67283
[LOGITS Ex2 A] Mean Abs: 1.359 | Max: 5.129
[LOSS Ex2] A: 0.26623 | B: 0.39599 | C: 0.35871
** [JOINT LOSS] ** : 1.015533
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004610 | Grad Max: 0.114234
  -> Layer: shared_layers.0.bias | Grad Mean: 0.209444 | Grad Max: 1.037751
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001798 | Grad Max: 0.007805
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000801 | Grad Max: 0.000801
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001503 | Grad Max: 0.129605
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028035 | Grad Max: 0.695996
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.008561
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012974 | Grad Max: 0.050246
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000601
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002891 | Grad Max: 0.006848
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000283
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000940 | Grad Max: 0.002529
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.004051
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027667 | Grad Max: 0.027667
[GRADIENT NORM TOTAL] 4.0473

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.324
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56782967 0.43217036] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 284/1764 | B: 286/1762 | C: 241/1807
[LOSS Ex1] A: 0.67447 | B: 0.67608 | C: 0.67325
[LOGITS Ex2 A] Mean Abs: 1.427 | Max: 5.249
[LOSS Ex2] A: 0.26191 | B: 0.43631 | C: 0.35280
** [JOINT LOSS] ** : 1.024939
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006954 | Grad Max: 0.160299
  -> Layer: shared_layers.0.bias | Grad Mean: 0.373784 | Grad Max: 1.817933
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.009430
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017535 | Grad Max: 0.017535
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002611 | Grad Max: 0.193216
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049154 | Grad Max: 1.044224
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000582 | Grad Max: 0.013893
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022849 | Grad Max: 0.080487
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001058
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005059 | Grad Max: 0.011310
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000435
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001650 | Grad Max: 0.003930
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003029 | Grad Max: 0.006041
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050060 | Grad Max: 0.050060
[GRADIENT NORM TOTAL] 7.1650

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.361
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022903  0.49770972] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 277/1771 | B: 306/1742 | C: 270/1778
[LOSS Ex1] A: 0.67714 | B: 0.67652 | C: 0.67050
[LOGITS Ex2 A] Mean Abs: 1.404 | Max: 5.273
[LOSS Ex2] A: 0.25343 | B: 0.41965 | C: 0.33822
** [JOINT LOSS] ** : 1.011823
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002622 | Grad Max: 0.062794
  -> Layer: shared_layers.0.bias | Grad Mean: 0.184800 | Grad Max: 0.925505
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.008337
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008297 | Grad Max: 0.008297
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001266 | Grad Max: 0.125008
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024129 | Grad Max: 0.710728
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.007644
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011465 | Grad Max: 0.045001
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000516
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002505 | Grad Max: 0.005702
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000263
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000821 | Grad Max: 0.002231
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001382 | Grad Max: 0.003939
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024169 | Grad Max: 0.024169
[GRADIENT NORM TOTAL] 3.6749

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.174
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56318676 0.43681327] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 280/1768 | B: 304/1744 | C: 263/1785
[LOSS Ex1] A: 0.67560 | B: 0.67403 | C: 0.67077
[LOGITS Ex2 A] Mean Abs: 1.362 | Max: 5.385
[LOSS Ex2] A: 0.26185 | B: 0.41472 | C: 0.37532
** [JOINT LOSS] ** : 1.024097
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004978 | Grad Max: 0.106427
  -> Layer: shared_layers.0.bias | Grad Mean: 0.256051 | Grad Max: 1.236791
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.008237
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001598 | Grad Max: 0.001598
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.212107
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035026 | Grad Max: 1.197060
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000413 | Grad Max: 0.009701
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016308 | Grad Max: 0.058455
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000831
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003612 | Grad Max: 0.008735
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000354
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001164 | Grad Max: 0.002936
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002055 | Grad Max: 0.003865
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033843 | Grad Max: 0.033843
[GRADIENT NORM TOTAL] 5.0603

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.231
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53832185 0.46167815] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 225/1391 | B: 274/1582 | C: 251/1797
[LOSS Ex1] A: 0.67466 | B: 0.67648 | C: 0.67261
[LOGITS Ex2 A] Mean Abs: 1.401 | Max: 5.615
[LOSS Ex2] A: 0.24778 | B: 0.41122 | C: 0.36962
** [JOINT LOSS] ** : 1.017459
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006887 | Grad Max: 0.172563
  -> Layer: shared_layers.0.bias | Grad Mean: 0.375597 | Grad Max: 1.822445
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.008975
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007150 | Grad Max: 0.007150
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002700 | Grad Max: 0.253024
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050374 | Grad Max: 1.426127
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000599 | Grad Max: 0.014385
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023549 | Grad Max: 0.084621
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001042
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005241 | Grad Max: 0.011352
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000463
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001703 | Grad Max: 0.004131
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003250 | Grad Max: 0.005786
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052325 | Grad Max: 0.052325
[GRADIENT NORM TOTAL] 7.3245

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.362
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073206  0.49267936] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 299/1749 | B: 287/1761 | C: 254/1794
[LOSS Ex1] A: 0.67490 | B: 0.67600 | C: 0.67133
[LOGITS Ex2 A] Mean Abs: 1.391 | Max: 6.943
[LOSS Ex2] A: 0.24415 | B: 0.43517 | C: 0.35157
** [JOINT LOSS] ** : 1.017711
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003389 | Grad Max: 0.081676
  -> Layer: shared_layers.0.bias | Grad Mean: 0.170885 | Grad Max: 0.915802
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.008935
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008734 | Grad Max: 0.008734
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001325 | Grad Max: 0.215941
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024509 | Grad Max: 1.218868
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000286 | Grad Max: 0.006471
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011261 | Grad Max: 0.040750
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000569
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002513 | Grad Max: 0.005960
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.002179
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001491 | Grad Max: 0.003227
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023166 | Grad Max: 0.023166
[GRADIENT NORM TOTAL] 3.6948

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.340
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032924  0.49670762] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 290/1758 | B: 308/1740 | C: 271/1777
[LOSS Ex1] A: 0.67408 | B: 0.67644 | C: 0.67015
[LOGITS Ex2 A] Mean Abs: 1.424 | Max: 5.269
[LOSS Ex2] A: 0.25091 | B: 0.42451 | C: 0.36227
** [JOINT LOSS] ** : 1.019452
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005047 | Grad Max: 0.115706
  -> Layer: shared_layers.0.bias | Grad Mean: 0.255155 | Grad Max: 1.212954
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.009222
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008801 | Grad Max: 0.008801
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001774 | Grad Max: 0.166766
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033567 | Grad Max: 0.896113
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.009545
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015529 | Grad Max: 0.057359
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000832
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003418 | Grad Max: 0.008288
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000357
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001097 | Grad Max: 0.002788
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001894 | Grad Max: 0.004237
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031582 | Grad Max: 0.031582
[GRADIENT NORM TOTAL] 4.9123

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.349
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5036386 0.4963614] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 294/1754 | B: 306/1742 | C: 231/1817
[LOSS Ex1] A: 0.67318 | B: 0.67393 | C: 0.67197
[LOGITS Ex2 A] Mean Abs: 1.437 | Max: 5.913
[LOSS Ex2] A: 0.27640 | B: 0.42214 | C: 0.36433
** [JOINT LOSS] ** : 1.027318
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007222 | Grad Max: 0.209688
  -> Layer: shared_layers.0.bias | Grad Mean: 0.339877 | Grad Max: 1.616648
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.008099
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002559 | Grad Max: 0.002559
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002469 | Grad Max: 0.206477
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045944 | Grad Max: 1.071246
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000532 | Grad Max: 0.012984
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020850 | Grad Max: 0.071930
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000958
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004657 | Grad Max: 0.011068
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000404
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001512 | Grad Max: 0.003603
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002839 | Grad Max: 0.005235
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045712 | Grad Max: 0.045712
[GRADIENT NORM TOTAL] 6.5483

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50654405 0.49345592] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 263/1785 | B: 275/1581 | C: 162/1214
[LOSS Ex1] A: 0.67658 | B: 0.67640 | C: 0.67438
[LOGITS Ex2 A] Mean Abs: 1.390 | Max: 5.419
[LOSS Ex2] A: 0.25223 | B: 0.40328 | C: 0.35823
** [JOINT LOSS] ** : 1.013701
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002801 | Grad Max: 0.057848
  -> Layer: shared_layers.0.bias | Grad Mean: 0.185146 | Grad Max: 0.813883
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001798 | Grad Max: 0.007413
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005241 | Grad Max: 0.005241
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001269 | Grad Max: 0.118703
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023855 | Grad Max: 0.666920
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000277 | Grad Max: 0.007695
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011151 | Grad Max: 0.047841
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000649
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002467 | Grad Max: 0.005877
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000261
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000809 | Grad Max: 0.002107
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001416 | Grad Max: 0.003828
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024632 | Grad Max: 0.024632
[GRADIENT NORM TOTAL] 3.6074

[EPOCH SUMMARY] Train Loss: 1.0192

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9926 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 45/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.205
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5172878 0.4827122] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.519 | Std: 0.021
[MASKS] A(Pass/Fail): 270/1778 | B: 290/1758 | C: 268/1780
[LOSS Ex1] A: 0.67612 | B: 0.67591 | C: 0.67102
[LOGITS Ex2 A] Mean Abs: 1.319 | Max: 5.583
[LOSS Ex2] A: 0.25801 | B: 0.43564 | C: 0.35019
** [JOINT LOSS] ** : 1.022298
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004215 | Grad Max: 0.098572
  -> Layer: shared_layers.0.bias | Grad Mean: 0.251763 | Grad Max: 1.269976
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001880 | Grad Max: 0.008039
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002330 | Grad Max: 0.002330
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001733 | Grad Max: 0.172768
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032721 | Grad Max: 0.991341
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.010386
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015295 | Grad Max: 0.061105
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000779
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003378 | Grad Max: 0.007820
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000321
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001088 | Grad Max: 0.002809
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002083 | Grad Max: 0.004047
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032596 | Grad Max: 0.032596
[GRADIENT NORM TOTAL] 5.0108

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.326
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56867164 0.43132836] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 297/1751 | B: 310/1738 | C: 266/1782
[LOSS Ex1] A: 0.67429 | B: 0.67636 | C: 0.67139
[LOGITS Ex2 A] Mean Abs: 1.372 | Max: 5.176
[LOSS Ex2] A: 0.24117 | B: 0.43063 | C: 0.35251
** [JOINT LOSS] ** : 1.015449
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004670 | Grad Max: 0.119356
  -> Layer: shared_layers.0.bias | Grad Mean: 0.317058 | Grad Max: 1.596569
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.009399
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013945 | Grad Max: 0.013945
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.179790
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040420 | Grad Max: 1.033652
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000475 | Grad Max: 0.010916
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018872 | Grad Max: 0.066608
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000918
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004146 | Grad Max: 0.009244
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000394
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001328 | Grad Max: 0.003347
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002391 | Grad Max: 0.004545
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038884 | Grad Max: 0.038884
[GRADIENT NORM TOTAL] 6.1083

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.364
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022282  0.49777183] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 284/1764 | B: 310/1738 | C: 277/1771
[LOSS Ex1] A: 0.67699 | B: 0.67384 | C: 0.67001
[LOGITS Ex2 A] Mean Abs: 1.388 | Max: 5.058
[LOSS Ex2] A: 0.24934 | B: 0.40255 | C: 0.35273
** [JOINT LOSS] ** : 1.008489
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003633 | Grad Max: 0.091492
  -> Layer: shared_layers.0.bias | Grad Mean: 0.150032 | Grad Max: 0.687604
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.007895
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004663 | Grad Max: 0.004663
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001053 | Grad Max: 0.101174
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019229 | Grad Max: 0.563323
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000216 | Grad Max: 0.005177
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008510 | Grad Max: 0.029463
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000436
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001952 | Grad Max: 0.004831
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000622 | Grad Max: 0.001749
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001212 | Grad Max: 0.003418
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018407 | Grad Max: 0.018407
[GRADIENT NORM TOTAL] 2.8285

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.175
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5638724  0.43612763] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 287/1761 | B: 276/1580 | C: 239/1809
[LOSS Ex1] A: 0.67543 | B: 0.67631 | C: 0.67337
[LOGITS Ex2 A] Mean Abs: 1.424 | Max: 5.341
[LOSS Ex2] A: 0.27251 | B: 0.39728 | C: 0.37043
** [JOINT LOSS] ** : 1.021776
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004780 | Grad Max: 0.098255
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226698 | Grad Max: 1.146889
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001827 | Grad Max: 0.007781
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003619 | Grad Max: 0.003619
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001611 | Grad Max: 0.105147
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030089 | Grad Max: 0.596413
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.009537
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013985 | Grad Max: 0.051011
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000649
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003094 | Grad Max: 0.007253
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000280
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001003 | Grad Max: 0.002444
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001901 | Grad Max: 0.004105
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030545 | Grad Max: 0.030545
[GRADIENT NORM TOTAL] 4.3652

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.233
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5387793  0.46122068] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 229/1387 | B: 291/1757 | C: 262/1786
[LOSS Ex1] A: 0.67449 | B: 0.67582 | C: 0.67081
[LOGITS Ex2 A] Mean Abs: 1.467 | Max: 5.325
[LOSS Ex2] A: 0.24818 | B: 0.43560 | C: 0.33529
** [JOINT LOSS] ** : 1.013395
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005017 | Grad Max: 0.110944
  -> Layer: shared_layers.0.bias | Grad Mean: 0.277218 | Grad Max: 1.376778
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.008223
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001014 | Grad Max: 0.001014
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001899 | Grad Max: 0.136014
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036064 | Grad Max: 0.714299
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000425 | Grad Max: 0.009987
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016811 | Grad Max: 0.060596
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000781
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003716 | Grad Max: 0.008453
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000350
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001196 | Grad Max: 0.002951
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002094 | Grad Max: 0.004766
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035697 | Grad Max: 0.035697
[GRADIENT NORM TOTAL] 5.2634

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.365
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072783  0.49272168] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 311/1737 | B: 310/1738 | C: 271/1777
[LOSS Ex1] A: 0.67472 | B: 0.67626 | C: 0.66999
[LOGITS Ex2 A] Mean Abs: 1.425 | Max: 6.859
[LOSS Ex2] A: 0.26198 | B: 0.42402 | C: 0.33133
** [JOINT LOSS] ** : 1.012771
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002676 | Grad Max: 0.058524
  -> Layer: shared_layers.0.bias | Grad Mean: 0.121225 | Grad Max: 0.571884
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.008311
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005951 | Grad Max: 0.005951
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000852 | Grad Max: 0.090465
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015485 | Grad Max: 0.496991
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000171 | Grad Max: 0.005441
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006708 | Grad Max: 0.027822
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000442
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001496 | Grad Max: 0.003816
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000187
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000492 | Grad Max: 0.001439
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000831 | Grad Max: 0.002646
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014250 | Grad Max: 0.014250
[GRADIENT NORM TOTAL] 2.3998

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.342
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503341   0.49665898] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 307/1741 | B: 313/1735 | C: 243/1805
[LOSS Ex1] A: 0.67388 | B: 0.67374 | C: 0.67163
[LOGITS Ex2 A] Mean Abs: 1.377 | Max: 5.350
[LOSS Ex2] A: 0.25314 | B: 0.41391 | C: 0.36026
** [JOINT LOSS] ** : 1.015521
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004596 | Grad Max: 0.105901
  -> Layer: shared_layers.0.bias | Grad Mean: 0.300405 | Grad Max: 1.397904
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.009351
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009870 | Grad Max: 0.009870
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.175927
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037469 | Grad Max: 0.999396
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.012439
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017787 | Grad Max: 0.068812
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000826
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003918 | Grad Max: 0.008572
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000324
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.002979
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002409 | Grad Max: 0.004284
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037915 | Grad Max: 0.037915
[GRADIENT NORM TOTAL] 5.6711

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.352
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50359046 0.49640954] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 295/1753 | B: 277/1579 | C: 258/1790
[LOSS Ex1] A: 0.67298 | B: 0.67622 | C: 0.67198
[LOGITS Ex2 A] Mean Abs: 1.362 | Max: 6.043
[LOSS Ex2] A: 0.27259 | B: 0.41412 | C: 0.35946
** [JOINT LOSS] ** : 1.022453
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006318 | Grad Max: 0.157675
  -> Layer: shared_layers.0.bias | Grad Mean: 0.424076 | Grad Max: 2.048934
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.008871
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006531 | Grad Max: 0.006531
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002796 | Grad Max: 0.222666
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053271 | Grad Max: 1.197317
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000631 | Grad Max: 0.015877
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025234 | Grad Max: 0.093247
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001195
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005567 | Grad Max: 0.012951
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000499
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001794 | Grad Max: 0.004318
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003282 | Grad Max: 0.006186
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053190 | Grad Max: 0.053190
[GRADIENT NORM TOTAL] 8.0063

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.299
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50650173 0.4934983 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 272/1776 | B: 291/1757 | C: 248/1800
[LOSS Ex1] A: 0.67642 | B: 0.67572 | C: 0.67179
[LOGITS Ex2 A] Mean Abs: 1.371 | Max: 5.231
[LOSS Ex2] A: 0.25511 | B: 0.42846 | C: 0.36349
** [JOINT LOSS] ** : 1.023662
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005319 | Grad Max: 0.132021
  -> Layer: shared_layers.0.bias | Grad Mean: 0.253097 | Grad Max: 1.221457
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001838 | Grad Max: 0.007329
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000115 | Grad Max: 0.000115
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001850 | Grad Max: 0.176570
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034683 | Grad Max: 1.011572
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000406 | Grad Max: 0.010138
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015885 | Grad Max: 0.054005
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000723
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003528 | Grad Max: 0.007554
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000335
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001144 | Grad Max: 0.002893
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002337 | Grad Max: 0.004288
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034978 | Grad Max: 0.034978
[GRADIENT NORM TOTAL] 5.0353

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.206
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5174494  0.48255062] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 278/1770 | B: 310/1738 | C: 278/1770
[LOSS Ex1] A: 0.67595 | B: 0.67617 | C: 0.67084
[LOGITS Ex2 A] Mean Abs: 1.383 | Max: 5.632
[LOSS Ex2] A: 0.25012 | B: 0.42498 | C: 0.35776
** [JOINT LOSS] ** : 1.018607
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003141 | Grad Max: 0.075057
  -> Layer: shared_layers.0.bias | Grad Mean: 0.205128 | Grad Max: 1.029907
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001870 | Grad Max: 0.007591
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001880 | Grad Max: 0.001880
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001406 | Grad Max: 0.120069
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026368 | Grad Max: 0.682591
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000296 | Grad Max: 0.007750
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011690 | Grad Max: 0.043398
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000643
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002584 | Grad Max: 0.006291
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000264
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000839 | Grad Max: 0.002127
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001442 | Grad Max: 0.003539
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024708 | Grad Max: 0.024708
[GRADIENT NORM TOTAL] 4.0505

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.329
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5694984  0.43050155] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 307/1741 | B: 313/1735 | C: 244/1804
[LOSS Ex1] A: 0.67410 | B: 0.67364 | C: 0.67315
[LOGITS Ex2 A] Mean Abs: 1.438 | Max: 5.274
[LOSS Ex2] A: 0.25662 | B: 0.41473 | C: 0.36739
** [JOINT LOSS] ** : 1.019874
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007380 | Grad Max: 0.175377
  -> Layer: shared_layers.0.bias | Grad Mean: 0.394464 | Grad Max: 2.023442
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.009640
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.018375 | Grad Max: 0.018375
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002792 | Grad Max: 0.209177
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052876 | Grad Max: 1.116848
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000617 | Grad Max: 0.014735
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024419 | Grad Max: 0.083974
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001070
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005410 | Grad Max: 0.011620
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000492
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001748 | Grad Max: 0.004137
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003246 | Grad Max: 0.006276
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052262 | Grad Max: 0.052262
[GRADIENT NORM TOTAL] 7.6012

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.367
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022001  0.49779987] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 288/1760 | B: 279/1577 | C: 248/1800
[LOSS Ex1] A: 0.67682 | B: 0.67613 | C: 0.67196
[LOGITS Ex2 A] Mean Abs: 1.437 | Max: 5.066
[LOSS Ex2] A: 0.25629 | B: 0.40237 | C: 0.35751
** [JOINT LOSS] ** : 1.013692
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003048 | Grad Max: 0.087374
  -> Layer: shared_layers.0.bias | Grad Mean: 0.204920 | Grad Max: 1.014286
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.008909
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013838 | Grad Max: 0.013838
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001380 | Grad Max: 0.131426
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026347 | Grad Max: 0.730105
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000317 | Grad Max: 0.009215
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012673 | Grad Max: 0.052971
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000713
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002770 | Grad Max: 0.006904
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000900 | Grad Max: 0.002306
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001676 | Grad Max: 0.003853
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027774 | Grad Max: 0.027774
[GRADIENT NORM TOTAL] 3.9114

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.177
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5645635  0.43543643] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 292/1756 | B: 296/1752 | C: 259/1789
[LOSS Ex1] A: 0.67525 | B: 0.67563 | C: 0.67159
[LOGITS Ex2 A] Mean Abs: 1.384 | Max: 5.638
[LOSS Ex2] A: 0.26683 | B: 0.43003 | C: 0.38139
** [JOINT LOSS] ** : 1.033572
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003615 | Grad Max: 0.100221
  -> Layer: shared_layers.0.bias | Grad Mean: 0.251046 | Grad Max: 1.222869
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001931 | Grad Max: 0.008339
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002712 | Grad Max: 0.002712
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001744 | Grad Max: 0.236627
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032821 | Grad Max: 1.338515
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000384 | Grad Max: 0.010575
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015328 | Grad Max: 0.060855
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000740
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003400 | Grad Max: 0.007583
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000317
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001109 | Grad Max: 0.002717
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002131 | Grad Max: 0.004458
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034081 | Grad Max: 0.034081
[GRADIENT NORM TOTAL] 5.0880

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.236
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5392329  0.46076712] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 232/1384 | B: 310/1738 | C: 145/1231
[LOSS Ex1] A: 0.67430 | B: 0.67608 | C: 0.67393
[LOGITS Ex2 A] Mean Abs: 1.414 | Max: 5.590
[LOSS Ex2] A: 0.25225 | B: 0.42892 | C: 0.35771
** [JOINT LOSS] ** : 1.021063
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006223 | Grad Max: 0.143286
  -> Layer: shared_layers.0.bias | Grad Mean: 0.352821 | Grad Max: 1.773617
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.008946
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009917 | Grad Max: 0.009917
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002514 | Grad Max: 0.255859
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046884 | Grad Max: 1.435117
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000553 | Grad Max: 0.013710
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021849 | Grad Max: 0.079247
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000996
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004853 | Grad Max: 0.010558
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000451
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001574 | Grad Max: 0.003880
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003119 | Grad Max: 0.005440
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048455 | Grad Max: 0.048455
[GRADIENT NORM TOTAL] 6.9134

[EPOCH SUMMARY] Train Loss: 1.0188

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9894 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9924 -> New: 0.9894)

############################## EPOCH 46/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.368
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072664 0.4927336] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 317/1731 | B: 317/1731 | C: 269/1779
[LOSS Ex1] A: 0.67454 | B: 0.67355 | C: 0.67131
[LOGITS Ex2 A] Mean Abs: 1.405 | Max: 5.177
[LOSS Ex2] A: 0.25235 | B: 0.40569 | C: 0.37168
** [JOINT LOSS] ** : 1.016375
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004618 | Grad Max: 0.105803
  -> Layer: shared_layers.0.bias | Grad Mean: 0.188473 | Grad Max: 0.967525
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002004 | Grad Max: 0.008049
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002995 | Grad Max: 0.002995
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001339 | Grad Max: 0.085560
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025360 | Grad Max: 0.434791
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.007593
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012032 | Grad Max: 0.043556
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000574
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002666 | Grad Max: 0.005975
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000257
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000859 | Grad Max: 0.002332
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001651 | Grad Max: 0.003213
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026032 | Grad Max: 0.026032
[GRADIENT NORM TOTAL] 3.5397

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.345
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5033882  0.49661183] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 317/1731 | B: 280/1576 | C: 273/1775
[LOSS Ex1] A: 0.67370 | B: 0.67604 | C: 0.66994
[LOGITS Ex2 A] Mean Abs: 1.450 | Max: 5.198
[LOSS Ex2] A: 0.25154 | B: 0.39985 | C: 0.34133
** [JOINT LOSS] ** : 1.004133
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005498 | Grad Max: 0.122706
  -> Layer: shared_layers.0.bias | Grad Mean: 0.276671 | Grad Max: 1.352551
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.009625
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014805 | Grad Max: 0.014805
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.160366
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037314 | Grad Max: 0.877789
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000445 | Grad Max: 0.011218
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017706 | Grad Max: 0.061429
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000913
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003919 | Grad Max: 0.009880
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000355
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001269 | Grad Max: 0.003066
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002316 | Grad Max: 0.005072
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038344 | Grad Max: 0.038344
[GRADIENT NORM TOTAL] 5.3686

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.354
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50354743 0.49645257] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 304/1744 | B: 296/1752 | C: 240/1808
[LOSS Ex1] A: 0.67280 | B: 0.67554 | C: 0.67330
[LOGITS Ex2 A] Mean Abs: 1.457 | Max: 5.993
[LOSS Ex2] A: 0.28309 | B: 0.43823 | C: 0.36407
** [JOINT LOSS] ** : 1.035675
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008728 | Grad Max: 0.232486
  -> Layer: shared_layers.0.bias | Grad Mean: 0.424894 | Grad Max: 2.043810
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.009360
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011840 | Grad Max: 0.011840
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003075 | Grad Max: 0.228928
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057388 | Grad Max: 1.174939
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000669 | Grad Max: 0.016778
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026406 | Grad Max: 0.093903
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001355
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005881 | Grad Max: 0.013978
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000513
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001897 | Grad Max: 0.004686
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003439 | Grad Max: 0.006326
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.056024 | Grad Max: 0.056024
[GRADIENT NORM TOTAL] 8.1254

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.301
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064656  0.49353442] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 279/1769 | B: 309/1739 | C: 247/1801
[LOSS Ex1] A: 0.67626 | B: 0.67599 | C: 0.67262
[LOGITS Ex2 A] Mean Abs: 1.422 | Max: 5.299
[LOSS Ex2] A: 0.25602 | B: 0.42629 | C: 0.36295
** [JOINT LOSS] ** : 1.023380
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004604 | Grad Max: 0.121041
  -> Layer: shared_layers.0.bias | Grad Mean: 0.228361 | Grad Max: 1.083431
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001811 | Grad Max: 0.007621
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003604 | Grad Max: 0.003604
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001565 | Grad Max: 0.143703
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029542 | Grad Max: 0.760579
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000346 | Grad Max: 0.009138
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013792 | Grad Max: 0.051761
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000771
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003047 | Grad Max: 0.008375
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000283
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000982 | Grad Max: 0.002373
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001663 | Grad Max: 0.003926
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028172 | Grad Max: 0.028172
[GRADIENT NORM TOTAL] 4.3458

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.208
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51759434 0.48240563] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.021
[MASKS] A(Pass/Fail): 283/1765 | B: 318/1730 | C: 264/1784
[LOSS Ex1] A: 0.67580 | B: 0.67347 | C: 0.66997
[LOGITS Ex2 A] Mean Abs: 1.336 | Max: 5.603
[LOSS Ex2] A: 0.26627 | B: 0.41522 | C: 0.34053
** [JOINT LOSS] ** : 1.013753
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003054 | Grad Max: 0.070023
  -> Layer: shared_layers.0.bias | Grad Mean: 0.213544 | Grad Max: 0.997498
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.007695
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002006 | Grad Max: 0.002006
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001440 | Grad Max: 0.147676
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027399 | Grad Max: 0.814221
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000321 | Grad Max: 0.009361
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012849 | Grad Max: 0.050959
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000577
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002812 | Grad Max: 0.006554
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000288
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000892 | Grad Max: 0.002406
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001590 | Grad Max: 0.003676
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025028 | Grad Max: 0.025028
[GRADIENT NORM TOTAL] 4.1497

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.331
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57030034 0.42969963] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 312/1736 | B: 281/1575 | C: 252/1796
[LOSS Ex1] A: 0.67392 | B: 0.67596 | C: 0.67111
[LOGITS Ex2 A] Mean Abs: 1.380 | Max: 5.511
[LOSS Ex2] A: 0.24704 | B: 0.39979 | C: 0.36876
** [JOINT LOSS] ** : 1.012194
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005299 | Grad Max: 0.113820
  -> Layer: shared_layers.0.bias | Grad Mean: 0.330200 | Grad Max: 1.589133
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.009200
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013475 | Grad Max: 0.013475
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002264 | Grad Max: 0.204533
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042764 | Grad Max: 1.181630
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000502 | Grad Max: 0.012764
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020061 | Grad Max: 0.078161
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000927
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004428 | Grad Max: 0.010491
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000417
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001417 | Grad Max: 0.003523
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002551 | Grad Max: 0.005151
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041294 | Grad Max: 0.041294
[GRADIENT NORM TOTAL] 6.3868

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.370
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50215673 0.49784324] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 290/1758 | B: 297/1751 | C: 236/1812
[LOSS Ex1] A: 0.67667 | B: 0.67546 | C: 0.67283
[LOGITS Ex2 A] Mean Abs: 1.400 | Max: 5.739
[LOSS Ex2] A: 0.24205 | B: 0.42885 | C: 0.34979
** [JOINT LOSS] ** : 1.015214
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004814 | Grad Max: 0.147848
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207582 | Grad Max: 0.991924
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001858 | Grad Max: 0.007881
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006234 | Grad Max: 0.006234
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001540 | Grad Max: 0.146271
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027808 | Grad Max: 0.823186
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.006904
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012042 | Grad Max: 0.039378
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000636
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002715 | Grad Max: 0.006385
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000271
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000878 | Grad Max: 0.002383
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001708 | Grad Max: 0.003615
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026195 | Grad Max: 0.026195
[GRADIENT NORM TOTAL] 4.0929

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5652136 0.4347864] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 295/1753 | B: 311/1737 | C: 244/1804
[LOSS Ex1] A: 0.67508 | B: 0.67591 | C: 0.67162
[LOGITS Ex2 A] Mean Abs: 1.445 | Max: 5.056
[LOSS Ex2] A: 0.25985 | B: 0.42863 | C: 0.35235
** [JOINT LOSS] ** : 1.021146
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003529 | Grad Max: 0.085187
  -> Layer: shared_layers.0.bias | Grad Mean: 0.230217 | Grad Max: 1.092062
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001917 | Grad Max: 0.008252
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006073 | Grad Max: 0.006073
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001522 | Grad Max: 0.126594
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028715 | Grad Max: 0.717857
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000334 | Grad Max: 0.008006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013361 | Grad Max: 0.051678
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000731
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002943 | Grad Max: 0.007015
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000319
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000951 | Grad Max: 0.002442
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001663 | Grad Max: 0.003554
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028015 | Grad Max: 0.028015
[GRADIENT NORM TOTAL] 4.4132

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.238
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5396543  0.46034566] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 237/1379 | B: 322/1726 | C: 248/1800
[LOSS Ex1] A: 0.67414 | B: 0.67338 | C: 0.67104
[LOGITS Ex2 A] Mean Abs: 1.485 | Max: 5.845
[LOSS Ex2] A: 0.24344 | B: 0.40546 | C: 0.35504
** [JOINT LOSS] ** : 1.007498
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005522 | Grad Max: 0.127247
  -> Layer: shared_layers.0.bias | Grad Mean: 0.338266 | Grad Max: 1.690731
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.008307
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001396 | Grad Max: 0.001396
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002307 | Grad Max: 0.171253
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043952 | Grad Max: 0.930888
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000515 | Grad Max: 0.014357
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020572 | Grad Max: 0.077374
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000933
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004533 | Grad Max: 0.009982
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000415
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001463 | Grad Max: 0.003555
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002524 | Grad Max: 0.005302
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042983 | Grad Max: 0.042983
[GRADIENT NORM TOTAL] 6.5079

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.371
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072278 0.4927722] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 321/1727 | B: 283/1573 | C: 243/1805
[LOSS Ex1] A: 0.67437 | B: 0.67588 | C: 0.67337
[LOGITS Ex2 A] Mean Abs: 1.453 | Max: 6.612
[LOSS Ex2] A: 0.25437 | B: 0.39795 | C: 0.36614
** [JOINT LOSS] ** : 1.014029
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003003 | Grad Max: 0.062745
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152884 | Grad Max: 0.763390
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.009104
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014919 | Grad Max: 0.014919
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001117 | Grad Max: 0.101677
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020937 | Grad Max: 0.574762
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.006588
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009695 | Grad Max: 0.039439
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000531
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002124 | Grad Max: 0.005960
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000202
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000691 | Grad Max: 0.001844
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001220 | Grad Max: 0.002947
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020531 | Grad Max: 0.020531
[GRADIENT NORM TOTAL] 3.0515

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.347
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034351 0.4965649] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 322/1726 | B: 297/1751 | C: 261/1787
[LOSS Ex1] A: 0.67351 | B: 0.67537 | C: 0.67190
[LOGITS Ex2 A] Mean Abs: 1.405 | Max: 6.166
[LOSS Ex2] A: 0.24846 | B: 0.43834 | C: 0.36256
** [JOINT LOSS] ** : 1.023379
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004937 | Grad Max: 0.110548
  -> Layer: shared_layers.0.bias | Grad Mean: 0.301164 | Grad Max: 1.468276
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.009173
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010930 | Grad Max: 0.010930
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.171031
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038863 | Grad Max: 0.961129
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000453 | Grad Max: 0.012218
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018174 | Grad Max: 0.069477
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000844
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004023 | Grad Max: 0.008640
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000358
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001297 | Grad Max: 0.003253
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002369 | Grad Max: 0.004371
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038181 | Grad Max: 0.038181
[GRADIENT NORM TOTAL] 5.7454

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.356
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50348884 0.49651122] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 309/1739 | B: 312/1736 | C: 265/1783
[LOSS Ex1] A: 0.67261 | B: 0.67583 | C: 0.67043
[LOGITS Ex2 A] Mean Abs: 1.369 | Max: 6.297
[LOSS Ex2] A: 0.26408 | B: 0.43337 | C: 0.34261
** [JOINT LOSS] ** : 1.019642
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006215 | Grad Max: 0.149591
  -> Layer: shared_layers.0.bias | Grad Mean: 0.399671 | Grad Max: 1.971603
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.009170
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008782 | Grad Max: 0.008782
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002673 | Grad Max: 0.239605
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051014 | Grad Max: 1.293868
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000609 | Grad Max: 0.014650
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024400 | Grad Max: 0.085105
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001045
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005353 | Grad Max: 0.012195
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000500
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001721 | Grad Max: 0.004411
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003033 | Grad Max: 0.005299
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049857 | Grad Max: 0.049857
[GRADIENT NORM TOTAL] 7.5749

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.303
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064067 0.4935933] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 286/1762 | B: 323/1725 | C: 282/1766
[LOSS Ex1] A: 0.67611 | B: 0.67329 | C: 0.66907
[LOGITS Ex2 A] Mean Abs: 1.373 | Max: 4.975
[LOSS Ex2] A: 0.25077 | B: 0.41359 | C: 0.35777
** [JOINT LOSS] ** : 1.013535
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004309 | Grad Max: 0.103400
  -> Layer: shared_layers.0.bias | Grad Mean: 0.250803 | Grad Max: 1.265411
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.007415
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002581 | Grad Max: 0.002581
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001719 | Grad Max: 0.107579
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032408 | Grad Max: 0.587134
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.009230
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015464 | Grad Max: 0.051707
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000739
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003408 | Grad Max: 0.007936
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000351
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001092 | Grad Max: 0.002825
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001983 | Grad Max: 0.004199
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031897 | Grad Max: 0.031897
[GRADIENT NORM TOTAL] 4.7577

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51775974 0.48224023] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 290/1758 | B: 284/1572 | C: 186/1190
[LOSS Ex1] A: 0.67565 | B: 0.67580 | C: 0.66849
[LOGITS Ex2 A] Mean Abs: 1.371 | Max: 5.331
[LOSS Ex2] A: 0.25910 | B: 0.39376 | C: 0.37233
** [JOINT LOSS] ** : 1.015040
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004745 | Grad Max: 0.107869
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215026 | Grad Max: 1.015331
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001969 | Grad Max: 0.007903
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000846 | Grad Max: 0.000846
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001540 | Grad Max: 0.141289
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029052 | Grad Max: 0.760325
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000331 | Grad Max: 0.009572
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013242 | Grad Max: 0.055511
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000695
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002958 | Grad Max: 0.007031
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000276
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000964 | Grad Max: 0.002354
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001697 | Grad Max: 0.003875
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028640 | Grad Max: 0.028640
[GRADIENT NORM TOTAL] 4.1170

[EPOCH SUMMARY] Train Loss: 1.0168

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9948 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 47/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.333
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5711283  0.42887172] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.023
[MASKS] A(Pass/Fail): 320/1728 | B: 298/1750 | C: 249/1799
[LOSS Ex1] A: 0.67374 | B: 0.67528 | C: 0.67113
[LOGITS Ex2 A] Mean Abs: 1.425 | Max: 5.556
[LOSS Ex2] A: 0.24238 | B: 0.43608 | C: 0.34933
** [JOINT LOSS] ** : 1.015980
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005941 | Grad Max: 0.137127
  -> Layer: shared_layers.0.bias | Grad Mean: 0.307943 | Grad Max: 1.495190
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.009193
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014094 | Grad Max: 0.014094
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.163013
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040844 | Grad Max: 0.841758
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000478 | Grad Max: 0.011305
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018974 | Grad Max: 0.067237
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000813
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004226 | Grad Max: 0.009294
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000348
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001367 | Grad Max: 0.003233
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002407 | Grad Max: 0.005207
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040638 | Grad Max: 0.040638
[GRADIENT NORM TOTAL] 5.8835

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.372
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020884 0.4979116] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 296/1752 | B: 314/1734 | C: 243/1805
[LOSS Ex1] A: 0.67651 | B: 0.67574 | C: 0.67258
[LOGITS Ex2 A] Mean Abs: 1.414 | Max: 5.002
[LOSS Ex2] A: 0.24437 | B: 0.42117 | C: 0.35282
** [JOINT LOSS] ** : 1.014396
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002632 | Grad Max: 0.061344
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160107 | Grad Max: 0.802911
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.008564
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014701 | Grad Max: 0.014701
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001163 | Grad Max: 0.114490
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021863 | Grad Max: 0.644134
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.006352
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010108 | Grad Max: 0.039018
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000568
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002213 | Grad Max: 0.005320
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000723 | Grad Max: 0.001840
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001205 | Grad Max: 0.003318
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021076 | Grad Max: 0.021076
[GRADIENT NORM TOTAL] 3.2526

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.181
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5659208  0.43407914] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.022
[MASKS] A(Pass/Fail): 301/1747 | B: 324/1724 | C: 246/1802
[LOSS Ex1] A: 0.67491 | B: 0.67320 | C: 0.67208
[LOGITS Ex2 A] Mean Abs: 1.389 | Max: 5.275
[LOSS Ex2] A: 0.26158 | B: 0.40960 | C: 0.35774
** [JOINT LOSS] ** : 1.016368
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004582 | Grad Max: 0.098565
  -> Layer: shared_layers.0.bias | Grad Mean: 0.275415 | Grad Max: 1.342393
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.008289
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002956 | Grad Max: 0.002956
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001914 | Grad Max: 0.205696
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036241 | Grad Max: 1.163772
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.009554
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016237 | Grad Max: 0.060123
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000791
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003594 | Grad Max: 0.008225
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000306
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001155 | Grad Max: 0.002777
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002113 | Grad Max: 0.004270
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033995 | Grad Max: 0.033995
[GRADIENT NORM TOTAL] 5.4673

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.240
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5401435  0.45985645] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 250/1366 | B: 287/1569 | C: 222/1826
[LOSS Ex1] A: 0.67396 | B: 0.67571 | C: 0.67283
[LOGITS Ex2 A] Mean Abs: 1.401 | Max: 5.368
[LOSS Ex2] A: 0.24645 | B: 0.41079 | C: 0.38543
** [JOINT LOSS] ** : 1.021728
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007225 | Grad Max: 0.178924
  -> Layer: shared_layers.0.bias | Grad Mean: 0.439408 | Grad Max: 2.140693
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001908 | Grad Max: 0.008466
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004332 | Grad Max: 0.004332
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003042 | Grad Max: 0.256063
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057400 | Grad Max: 1.413489
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000671 | Grad Max: 0.015464
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026826 | Grad Max: 0.094338
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001113
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005943 | Grad Max: 0.012916
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000509
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001919 | Grad Max: 0.004731
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003550 | Grad Max: 0.006578
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057614 | Grad Max: 0.057614
[GRADIENT NORM TOTAL] 8.4282

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.373
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50716627 0.49283376] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 328/1720 | B: 298/1750 | C: 253/1795
[LOSS Ex1] A: 0.67420 | B: 0.67520 | C: 0.67141
[LOGITS Ex2 A] Mean Abs: 1.403 | Max: 6.773
[LOSS Ex2] A: 0.24504 | B: 0.43235 | C: 0.35436
** [JOINT LOSS] ** : 1.017521
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004827 | Grad Max: 0.127729
  -> Layer: shared_layers.0.bias | Grad Mean: 0.228703 | Grad Max: 1.069963
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.008787
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008341 | Grad Max: 0.008341
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001650 | Grad Max: 0.183700
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031018 | Grad Max: 1.052038
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.009146
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014262 | Grad Max: 0.052186
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000747
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003186 | Grad Max: 0.007575
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000269
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001021 | Grad Max: 0.002441
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001826 | Grad Max: 0.003526
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029403 | Grad Max: 0.029403
[GRADIENT NORM TOTAL] 4.4379

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.349
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5035439  0.49645603] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 327/1721 | B: 314/1734 | C: 269/1779
[LOSS Ex1] A: 0.67333 | B: 0.67566 | C: 0.67057
[LOGITS Ex2 A] Mean Abs: 1.416 | Max: 5.352
[LOSS Ex2] A: 0.25629 | B: 0.42988 | C: 0.36235
** [JOINT LOSS] ** : 1.022692
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003173 | Grad Max: 0.076531
  -> Layer: shared_layers.0.bias | Grad Mean: 0.185400 | Grad Max: 0.878651
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.008642
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007309 | Grad Max: 0.007309
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001228 | Grad Max: 0.113957
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022869 | Grad Max: 0.621411
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000264 | Grad Max: 0.006942
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010652 | Grad Max: 0.040383
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000547
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002373 | Grad Max: 0.005628
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000765 | Grad Max: 0.001981
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001258 | Grad Max: 0.003060
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021613 | Grad Max: 0.021613
[GRADIENT NORM TOTAL] 3.5609

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.358
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034003  0.49659967] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 318/1730 | B: 325/1723 | C: 267/1781
[LOSS Ex1] A: 0.67243 | B: 0.67310 | C: 0.67099
[LOGITS Ex2 A] Mean Abs: 1.432 | Max: 5.559
[LOSS Ex2] A: 0.27132 | B: 0.40441 | C: 0.34487
** [JOINT LOSS] ** : 1.012374
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007697 | Grad Max: 0.229520
  -> Layer: shared_layers.0.bias | Grad Mean: 0.329675 | Grad Max: 1.581736
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.008834
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005878 | Grad Max: 0.005878
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002443 | Grad Max: 0.181983
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044936 | Grad Max: 0.934587
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000509 | Grad Max: 0.012427
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020066 | Grad Max: 0.068886
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000882
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004540 | Grad Max: 0.009924
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000407
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001484 | Grad Max: 0.003803
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002733 | Grad Max: 0.005554
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044430 | Grad Max: 0.044430
[GRADIENT NORM TOTAL] 6.3086

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.305
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063333 0.4936667] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 292/1756 | B: 287/1569 | C: 281/1767
[LOSS Ex1] A: 0.67596 | B: 0.67563 | C: 0.66925
[LOGITS Ex2 A] Mean Abs: 1.405 | Max: 5.079
[LOSS Ex2] A: 0.23907 | B: 0.40066 | C: 0.34874
** [JOINT LOSS] ** : 1.003102
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002735 | Grad Max: 0.064078
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127255 | Grad Max: 0.627468
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001882 | Grad Max: 0.007408
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000818 | Grad Max: 0.000818
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000879 | Grad Max: 0.093627
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016284 | Grad Max: 0.517268
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.004893
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007246 | Grad Max: 0.026702
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000407
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001602 | Grad Max: 0.004116
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000174
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000521 | Grad Max: 0.001495
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000829 | Grad Max: 0.003030
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015305 | Grad Max: 0.015305
[GRADIENT NORM TOTAL] 2.4578

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.210
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.517954   0.48204604] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 296/1752 | B: 299/1749 | C: 250/1798
[LOSS Ex1] A: 0.67549 | B: 0.67511 | C: 0.67161
[LOGITS Ex2 A] Mean Abs: 1.330 | Max: 5.506
[LOSS Ex2] A: 0.25337 | B: 0.43879 | C: 0.34701
** [JOINT LOSS] ** : 1.020459
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004409 | Grad Max: 0.109617
  -> Layer: shared_layers.0.bias | Grad Mean: 0.272121 | Grad Max: 1.393224
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.008151
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004433 | Grad Max: 0.004433
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.158829
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036255 | Grad Max: 0.903652
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.010420
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017105 | Grad Max: 0.061709
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000753
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003802 | Grad Max: 0.008513
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000341
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001221 | Grad Max: 0.002952
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002176 | Grad Max: 0.004136
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035401 | Grad Max: 0.035401
[GRADIENT NORM TOTAL] 5.4156

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.335
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5719583  0.42804173] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.023
[MASKS] A(Pass/Fail): 339/1709 | B: 315/1733 | C: 260/1788
[LOSS Ex1] A: 0.67355 | B: 0.67556 | C: 0.66997
[LOGITS Ex2 A] Mean Abs: 1.384 | Max: 5.409
[LOSS Ex2] A: 0.24085 | B: 0.42823 | C: 0.34514
** [JOINT LOSS] ** : 1.011106
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006727 | Grad Max: 0.186652
  -> Layer: shared_layers.0.bias | Grad Mean: 0.334908 | Grad Max: 1.664445
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.008799
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011413 | Grad Max: 0.011413
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002414 | Grad Max: 0.179130
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045309 | Grad Max: 1.008950
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000536 | Grad Max: 0.013988
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021370 | Grad Max: 0.082375
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000976
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004756 | Grad Max: 0.011118
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000422
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001525 | Grad Max: 0.003851
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002677 | Grad Max: 0.005452
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043801 | Grad Max: 0.043801
[GRADIENT NORM TOTAL] 6.4660

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.375
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50202525 0.49797478] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 302/1746 | B: 326/1722 | C: 239/1809
[LOSS Ex1] A: 0.67636 | B: 0.67301 | C: 0.67222
[LOGITS Ex2 A] Mean Abs: 1.391 | Max: 5.258
[LOSS Ex2] A: 0.24156 | B: 0.41169 | C: 0.36534
** [JOINT LOSS] ** : 1.013392
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004795 | Grad Max: 0.131866
  -> Layer: shared_layers.0.bias | Grad Mean: 0.155741 | Grad Max: 0.718826
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001923 | Grad Max: 0.007640
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004664 | Grad Max: 0.004664
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001167 | Grad Max: 0.078036
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021200 | Grad Max: 0.449484
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.006417
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009575 | Grad Max: 0.036347
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000458
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002162 | Grad Max: 0.004926
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000226
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000699 | Grad Max: 0.001844
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001369 | Grad Max: 0.003123
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020274 | Grad Max: 0.020274
[GRADIENT NORM TOTAL] 2.9190

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.182
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5666016  0.43339843] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 308/1740 | B: 289/1567 | C: 296/1752
[LOSS Ex1] A: 0.67475 | B: 0.67553 | C: 0.66786
[LOGITS Ex2 A] Mean Abs: 1.437 | Max: 5.701
[LOSS Ex2] A: 0.26849 | B: 0.40191 | C: 0.35160
** [JOINT LOSS] ** : 1.013382
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004889 | Grad Max: 0.115872
  -> Layer: shared_layers.0.bias | Grad Mean: 0.279501 | Grad Max: 1.405154
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.008673
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006974 | Grad Max: 0.006974
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.165337
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037456 | Grad Max: 0.916805
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.011678
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017706 | Grad Max: 0.064116
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000844
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003895 | Grad Max: 0.008377
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000361
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001268 | Grad Max: 0.003042
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002380 | Grad Max: 0.005021
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039345 | Grad Max: 0.039345
[GRADIENT NORM TOTAL] 5.4791

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.243
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5405743  0.45942572] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 259/1357 | B: 300/1748 | C: 263/1785
[LOSS Ex1] A: 0.67379 | B: 0.67501 | C: 0.67067
[LOGITS Ex2 A] Mean Abs: 1.490 | Max: 5.332
[LOSS Ex2] A: 0.25294 | B: 0.44794 | C: 0.34018
** [JOINT LOSS] ** : 1.020178
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007281 | Grad Max: 0.159832
  -> Layer: shared_layers.0.bias | Grad Mean: 0.409706 | Grad Max: 1.994413
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.008538
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002297 | Grad Max: 0.002297
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002835 | Grad Max: 0.215267
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053600 | Grad Max: 1.203458
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000629 | Grad Max: 0.017537
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025276 | Grad Max: 0.099692
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001136
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005577 | Grad Max: 0.012054
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000474
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001792 | Grad Max: 0.004258
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003064 | Grad Max: 0.005671
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052229 | Grad Max: 0.052229
[GRADIENT NORM TOTAL] 7.8259

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.376
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50714207 0.49285793] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 332/1716 | B: 316/1732 | C: 180/1196
[LOSS Ex1] A: 0.67403 | B: 0.67547 | C: 0.66954
[LOGITS Ex2 A] Mean Abs: 1.454 | Max: 6.310
[LOSS Ex2] A: 0.26680 | B: 0.41666 | C: 0.36037
** [JOINT LOSS] ** : 1.020955
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006174 | Grad Max: 0.167416
  -> Layer: shared_layers.0.bias | Grad Mean: 0.304102 | Grad Max: 1.398295
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.008324
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006305 | Grad Max: 0.006305
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.191771
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040023 | Grad Max: 1.013485
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000458 | Grad Max: 0.011445
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018263 | Grad Max: 0.063587
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000722
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004072 | Grad Max: 0.008895
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000370
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001321 | Grad Max: 0.003278
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002346 | Grad Max: 0.004782
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038541 | Grad Max: 0.038541
[GRADIENT NORM TOTAL] 5.9161

[EPOCH SUMMARY] Train Loss: 1.0160

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9849 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9894 -> New: 0.9849)

############################## EPOCH 48/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.351
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50357544 0.49642453] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 336/1712 | B: 326/1722 | C: 262/1786
[LOSS Ex1] A: 0.67314 | B: 0.67291 | C: 0.66976
[LOGITS Ex2 A] Mean Abs: 1.417 | Max: 5.986
[LOSS Ex2] A: 0.23669 | B: 0.40817 | C: 0.35135
** [JOINT LOSS] ** : 1.004006
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001835 | Grad Max: 0.043338
  -> Layer: shared_layers.0.bias | Grad Mean: 0.121080 | Grad Max: 0.600969
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.008930
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007331 | Grad Max: 0.007331
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000856 | Grad Max: 0.082029
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015490 | Grad Max: 0.459672
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.005423
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007132 | Grad Max: 0.030806
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000400
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001567 | Grad Max: 0.004166
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000165
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000500 | Grad Max: 0.001442
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000928 | Grad Max: 0.002718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014388 | Grad Max: 0.014388
[GRADIENT NORM TOTAL] 2.4248

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.361
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5033771 0.4966229] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 325/1723 | B: 290/1566 | C: 252/1796
[LOSS Ex1] A: 0.67224 | B: 0.67545 | C: 0.67123
[LOGITS Ex2 A] Mean Abs: 1.408 | Max: 5.765
[LOSS Ex2] A: 0.25493 | B: 0.39948 | C: 0.34442
** [JOINT LOSS] ** : 1.005916
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003111 | Grad Max: 0.086394
  -> Layer: shared_layers.0.bias | Grad Mean: 0.255682 | Grad Max: 1.178819
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.009468
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011129 | Grad Max: 0.011129
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001590 | Grad Max: 0.146565
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029698 | Grad Max: 0.842963
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000345 | Grad Max: 0.009369
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013969 | Grad Max: 0.056086
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000621
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003057 | Grad Max: 0.007283
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000312
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000966 | Grad Max: 0.002671
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001637 | Grad Max: 0.003960
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026782 | Grad Max: 0.026782
[GRADIENT NORM TOTAL] 4.8023

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.307
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50632185 0.49367815] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 298/1750 | B: 301/1747 | C: 245/1803
[LOSS Ex1] A: 0.67579 | B: 0.67492 | C: 0.67321
[LOGITS Ex2 A] Mean Abs: 1.382 | Max: 4.950
[LOSS Ex2] A: 0.24686 | B: 0.42431 | C: 0.34342
** [JOINT LOSS] ** : 1.012836
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.040354
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084519 | Grad Max: 0.288024
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.007627
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006141 | Grad Max: 0.006141
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000626 | Grad Max: 0.093569
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011118 | Grad Max: 0.535572
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000115 | Grad Max: 0.003858
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004548 | Grad Max: 0.019903
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000322
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001020 | Grad Max: 0.002883
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000142
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000319 | Grad Max: 0.001003
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000634 | Grad Max: 0.002115
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008865 | Grad Max: 0.008865
[GRADIENT NORM TOTAL] 1.7811

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.211
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51810944 0.48189062] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.520 | Std: 0.022
[MASKS] A(Pass/Fail): 307/1741 | B: 316/1732 | C: 261/1787
[LOSS Ex1] A: 0.67532 | B: 0.67538 | C: 0.67099
[LOGITS Ex2 A] Mean Abs: 1.392 | Max: 5.328
[LOSS Ex2] A: 0.26394 | B: 0.42462 | C: 0.35359
** [JOINT LOSS] ** : 1.021281
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005299 | Grad Max: 0.122013
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268789 | Grad Max: 1.237447
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001851 | Grad Max: 0.007817
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002149 | Grad Max: 0.002149
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001874 | Grad Max: 0.146556
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034770 | Grad Max: 0.782862
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.009730
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015989 | Grad Max: 0.053467
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000755
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003581 | Grad Max: 0.007784
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001154 | Grad Max: 0.002876
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001988 | Grad Max: 0.004326
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033410 | Grad Max: 0.033410
[GRADIENT NORM TOTAL] 5.1613

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.338
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5728628  0.42713714] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.023
[MASKS] A(Pass/Fail): 347/1701 | B: 326/1722 | C: 276/1772
[LOSS Ex1] A: 0.67335 | B: 0.67280 | C: 0.66865
[LOGITS Ex2 A] Mean Abs: 1.437 | Max: 5.715
[LOSS Ex2] A: 0.24312 | B: 0.41571 | C: 0.35149
** [JOINT LOSS] ** : 1.008374
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005161 | Grad Max: 0.133228
  -> Layer: shared_layers.0.bias | Grad Mean: 0.356174 | Grad Max: 1.595729
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.008736
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006379 | Grad Max: 0.006379
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002413 | Grad Max: 0.193680
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045372 | Grad Max: 1.099101
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.012860
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021553 | Grad Max: 0.076402
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000949
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004771 | Grad Max: 0.010573
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000387
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001547 | Grad Max: 0.003746
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002712 | Grad Max: 0.005745
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045950 | Grad Max: 0.045950
[GRADIENT NORM TOTAL] 6.8349

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.378
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.501994   0.49800596] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 316/1732 | B: 290/1566 | C: 272/1776
[LOSS Ex1] A: 0.67617 | B: 0.67534 | C: 0.66954
[LOGITS Ex2 A] Mean Abs: 1.420 | Max: 5.023
[LOSS Ex2] A: 0.23913 | B: 0.39646 | C: 0.32575
** [JOINT LOSS] ** : 0.994131
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002862 | Grad Max: 0.075379
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100342 | Grad Max: 0.420279
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.008575
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011295 | Grad Max: 0.011295
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000777 | Grad Max: 0.089578
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013429 | Grad Max: 0.505148
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.004431
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005495 | Grad Max: 0.024086
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000372
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001160 | Grad Max: 0.003639
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000140
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000385 | Grad Max: 0.001228
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000712 | Grad Max: 0.002590
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011919 | Grad Max: 0.011919
[GRADIENT NORM TOTAL] 2.0667

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.185
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5674179  0.43258202] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 315/1733 | B: 301/1747 | C: 257/1791
[LOSS Ex1] A: 0.67454 | B: 0.67481 | C: 0.67065
[LOGITS Ex2 A] Mean Abs: 1.394 | Max: 5.564
[LOSS Ex2] A: 0.25725 | B: 0.43894 | C: 0.35901
** [JOINT LOSS] ** : 1.025068
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005987 | Grad Max: 0.145733
  -> Layer: shared_layers.0.bias | Grad Mean: 0.340487 | Grad Max: 1.622887
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001966 | Grad Max: 0.008228
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006025 | Grad Max: 0.006025
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.245523
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043702 | Grad Max: 1.388818
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000504 | Grad Max: 0.011210
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020267 | Grad Max: 0.068426
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000927
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004479 | Grad Max: 0.010233
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000403
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001441 | Grad Max: 0.003566
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002627 | Grad Max: 0.005160
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042109 | Grad Max: 0.042109
[GRADIENT NORM TOTAL] 6.5320

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.246
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54112005 0.45887992] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.023
[MASKS] A(Pass/Fail): 266/1350 | B: 316/1732 | C: 257/1791
[LOSS Ex1] A: 0.67358 | B: 0.67527 | C: 0.67116
[LOGITS Ex2 A] Mean Abs: 1.400 | Max: 5.126
[LOSS Ex2] A: 0.25172 | B: 0.43402 | C: 0.38199
** [JOINT LOSS] ** : 1.029249
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007664 | Grad Max: 0.181387
  -> Layer: shared_layers.0.bias | Grad Mean: 0.469052 | Grad Max: 2.290030
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001941 | Grad Max: 0.007935
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002736 | Grad Max: 0.002736
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003184 | Grad Max: 0.276009
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060484 | Grad Max: 1.556705
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000712 | Grad Max: 0.017963
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028668 | Grad Max: 0.102595
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001191
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006335 | Grad Max: 0.014011
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000519
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002058 | Grad Max: 0.004697
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003959 | Grad Max: 0.007817
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063501 | Grad Max: 0.063501
[GRADIENT NORM TOTAL] 8.9677

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.379
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50710845 0.49289155] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.023
[MASKS] A(Pass/Fail): 339/1709 | B: 328/1720 | C: 244/1804
[LOSS Ex1] A: 0.67381 | B: 0.67269 | C: 0.67137
[LOGITS Ex2 A] Mean Abs: 1.422 | Max: 6.800
[LOSS Ex2] A: 0.24371 | B: 0.41329 | C: 0.36191
** [JOINT LOSS] ** : 1.012262
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005222 | Grad Max: 0.119860
  -> Layer: shared_layers.0.bias | Grad Mean: 0.235330 | Grad Max: 1.206516
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.008766
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010936 | Grad Max: 0.010936
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001756 | Grad Max: 0.216859
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033181 | Grad Max: 1.225959
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.008801
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015098 | Grad Max: 0.051123
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000756
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003366 | Grad Max: 0.007808
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000313
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001081 | Grad Max: 0.002635
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002009 | Grad Max: 0.003856
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031502 | Grad Max: 0.031502
[GRADIENT NORM TOTAL] 4.7631

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.354
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50364685 0.49635312] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 349/1699 | B: 294/1562 | C: 264/1784
[LOSS Ex1] A: 0.67291 | B: 0.67524 | C: 0.66935
[LOGITS Ex2 A] Mean Abs: 1.456 | Max: 5.766
[LOSS Ex2] A: 0.24059 | B: 0.39927 | C: 0.36433
** [JOINT LOSS] ** : 1.007230
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003549 | Grad Max: 0.092070
  -> Layer: shared_layers.0.bias | Grad Mean: 0.164763 | Grad Max: 0.806368
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.008846
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007803 | Grad Max: 0.007803
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001111 | Grad Max: 0.107016
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020739 | Grad Max: 0.538420
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.006250
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009528 | Grad Max: 0.033198
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000493
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002116 | Grad Max: 0.004998
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000220
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000674 | Grad Max: 0.001720
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001059 | Grad Max: 0.002635
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018325 | Grad Max: 0.018325
[GRADIENT NORM TOTAL] 3.1462

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.363
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50328034 0.49671966] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 331/1717 | B: 303/1745 | C: 240/1808
[LOSS Ex1] A: 0.67201 | B: 0.67470 | C: 0.67196
[LOGITS Ex2 A] Mean Abs: 1.472 | Max: 6.283
[LOSS Ex2] A: 0.27475 | B: 0.42775 | C: 0.35753
** [JOINT LOSS] ** : 1.026232
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007963 | Grad Max: 0.244065
  -> Layer: shared_layers.0.bias | Grad Mean: 0.299226 | Grad Max: 1.429959
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.008626
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005784 | Grad Max: 0.005784
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.164164
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040105 | Grad Max: 0.809773
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.010760
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017758 | Grad Max: 0.059633
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000921
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004014 | Grad Max: 0.009551
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000357
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001283 | Grad Max: 0.003265
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002250 | Grad Max: 0.004287
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036635 | Grad Max: 0.036635
[GRADIENT NORM TOTAL] 5.6638

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.309
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062355  0.49376455] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 311/1737 | B: 316/1732 | C: 233/1815
[LOSS Ex1] A: 0.67561 | B: 0.67516 | C: 0.67115
[LOGITS Ex2 A] Mean Abs: 1.415 | Max: 6.110
[LOSS Ex2] A: 0.23993 | B: 0.41279 | C: 0.33798
** [JOINT LOSS] ** : 1.004204
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003475 | Grad Max: 0.082888
  -> Layer: shared_layers.0.bias | Grad Mean: 0.146103 | Grad Max: 0.635050
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.007723
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005645 | Grad Max: 0.005645
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000983 | Grad Max: 0.102319
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018022 | Grad Max: 0.536336
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.005774
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007999 | Grad Max: 0.030094
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000431
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001781 | Grad Max: 0.004341
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000220
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000582 | Grad Max: 0.001701
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000966 | Grad Max: 0.003244
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017062 | Grad Max: 0.017062
[GRADIENT NORM TOTAL] 2.7510

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.213
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51833415 0.48166585] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 316/1732 | B: 330/1718 | C: 261/1787
[LOSS Ex1] A: 0.67513 | B: 0.67258 | C: 0.67078
[LOGITS Ex2 A] Mean Abs: 1.350 | Max: 5.322
[LOSS Ex2] A: 0.25430 | B: 0.40586 | C: 0.34371
** [JOINT LOSS] ** : 1.007449
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004681 | Grad Max: 0.102360
  -> Layer: shared_layers.0.bias | Grad Mean: 0.279930 | Grad Max: 1.379538
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.007764
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000125 | Grad Max: 0.000125
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001917 | Grad Max: 0.234994
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036600 | Grad Max: 1.314459
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000420 | Grad Max: 0.010253
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017070 | Grad Max: 0.060058
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000749
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003772 | Grad Max: 0.008252
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000362
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001199 | Grad Max: 0.003149
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002180 | Grad Max: 0.004278
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034602 | Grad Max: 0.034602
[GRADIENT NORM TOTAL] 5.5036

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.340
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57389414 0.42610592] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.023
[MASKS] A(Pass/Fail): 354/1694 | B: 296/1560 | C: 195/1181
[LOSS Ex1] A: 0.67313 | B: 0.67513 | C: 0.66736
[LOGITS Ex2 A] Mean Abs: 1.392 | Max: 5.670
[LOSS Ex2] A: 0.23748 | B: 0.40828 | C: 0.34933
** [JOINT LOSS] ** : 1.003569
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005729 | Grad Max: 0.116342
  -> Layer: shared_layers.0.bias | Grad Mean: 0.354439 | Grad Max: 1.629878
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.009254
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012771 | Grad Max: 0.012771
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.237787
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044997 | Grad Max: 1.339518
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000526 | Grad Max: 0.013703
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021180 | Grad Max: 0.076581
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000976
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004684 | Grad Max: 0.010377
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000425
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001488 | Grad Max: 0.003873
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002598 | Grad Max: 0.005018
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042148 | Grad Max: 0.042148
[GRADIENT NORM TOTAL] 6.8622

[EPOCH SUMMARY] Train Loss: 1.0116

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9857 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 49/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.381
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019112 0.4980888] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 321/1727 | B: 303/1745 | C: 250/1798
[LOSS Ex1] A: 0.67598 | B: 0.67459 | C: 0.67147
[LOGITS Ex2 A] Mean Abs: 1.426 | Max: 5.379
[LOSS Ex2] A: 0.23698 | B: 0.42669 | C: 0.34435
** [JOINT LOSS] ** : 1.010024
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004422 | Grad Max: 0.145876
  -> Layer: shared_layers.0.bias | Grad Mean: 0.183203 | Grad Max: 0.811981
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.008453
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011506 | Grad Max: 0.011506
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001291 | Grad Max: 0.114021
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023058 | Grad Max: 0.613790
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000262 | Grad Max: 0.006081
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010355 | Grad Max: 0.034552
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000508
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002370 | Grad Max: 0.005641
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000239
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000762 | Grad Max: 0.001961
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001491 | Grad Max: 0.003023
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022665 | Grad Max: 0.022665
[GRADIENT NORM TOTAL] 3.4220

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.187
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56826067 0.4317393 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 321/1727 | B: 316/1732 | C: 258/1790
[LOSS Ex1] A: 0.67434 | B: 0.67505 | C: 0.67002
[LOGITS Ex2 A] Mean Abs: 1.461 | Max: 5.272
[LOSS Ex2] A: 0.25424 | B: 0.42857 | C: 0.35394
** [JOINT LOSS] ** : 1.018719
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003143 | Grad Max: 0.098847
  -> Layer: shared_layers.0.bias | Grad Mean: 0.291476 | Grad Max: 1.440985
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001928 | Grad Max: 0.007660
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001133 | Grad Max: 0.001133
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.155394
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036218 | Grad Max: 0.886967
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.011219
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016831 | Grad Max: 0.064059
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000733
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003689 | Grad Max: 0.008115
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001177 | Grad Max: 0.002950
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001988 | Grad Max: 0.004230
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033648 | Grad Max: 0.033648
[GRADIENT NORM TOTAL] 5.7523

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.248
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54166    0.45834005] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.023
[MASKS] A(Pass/Fail): 274/1342 | B: 332/1716 | C: 227/1821
[LOSS Ex1] A: 0.67338 | B: 0.67246 | C: 0.67345
[LOGITS Ex2 A] Mean Abs: 1.521 | Max: 5.480
[LOSS Ex2] A: 0.24799 | B: 0.41497 | C: 0.34281
** [JOINT LOSS] ** : 1.008353
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005495 | Grad Max: 0.148288
  -> Layer: shared_layers.0.bias | Grad Mean: 0.374406 | Grad Max: 1.865800
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.008304
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003088 | Grad Max: 0.003088
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002536 | Grad Max: 0.203690
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048059 | Grad Max: 1.115474
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000553 | Grad Max: 0.013723
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022465 | Grad Max: 0.084186
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.001016
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004938 | Grad Max: 0.010793
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000443
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001581 | Grad Max: 0.003747
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002808 | Grad Max: 0.005420
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046396 | Grad Max: 0.046396
[GRADIENT NORM TOTAL] 7.2566

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.382
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070578  0.49294224] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 346/1702 | B: 298/1558 | C: 247/1801
[LOSS Ex1] A: 0.67361 | B: 0.67503 | C: 0.67124
[LOGITS Ex2 A] Mean Abs: 1.478 | Max: 6.196
[LOSS Ex2] A: 0.25121 | B: 0.40171 | C: 0.34980
** [JOINT LOSS] ** : 1.007534
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004129 | Grad Max: 0.097160
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264942 | Grad Max: 1.267545
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002014 | Grad Max: 0.008868
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012212 | Grad Max: 0.012212
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.148551
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034176 | Grad Max: 0.794081
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.010102
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015995 | Grad Max: 0.059016
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000682
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003517 | Grad Max: 0.007714
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000331
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001132 | Grad Max: 0.002867
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002003 | Grad Max: 0.004393
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033186 | Grad Max: 0.033186
[GRADIENT NORM TOTAL] 5.0804

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.356
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037253  0.49627474] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 360/1688 | B: 303/1745 | C: 277/1771
[LOSS Ex1] A: 0.67269 | B: 0.67449 | C: 0.66879
[LOGITS Ex2 A] Mean Abs: 1.426 | Max: 5.024
[LOSS Ex2] A: 0.23717 | B: 0.43048 | C: 0.34024
** [JOINT LOSS] ** : 1.007953
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002951 | Grad Max: 0.072665
  -> Layer: shared_layers.0.bias | Grad Mean: 0.155143 | Grad Max: 0.694074
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.008986
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007887 | Grad Max: 0.007887
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001082 | Grad Max: 0.140860
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020033 | Grad Max: 0.798523
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000226 | Grad Max: 0.005639
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009084 | Grad Max: 0.035319
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000485
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002027 | Grad Max: 0.005274
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000651 | Grad Max: 0.001726
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001259 | Grad Max: 0.002814
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019391 | Grad Max: 0.019391
[GRADIENT NORM TOTAL] 3.0595

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.366
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50322175 0.49677828] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 339/1709 | B: 317/1731 | C: 279/1769
[LOSS Ex1] A: 0.67178 | B: 0.67495 | C: 0.66835
[LOGITS Ex2 A] Mean Abs: 1.424 | Max: 5.301
[LOSS Ex2] A: 0.25091 | B: 0.42626 | C: 0.34325
** [JOINT LOSS] ** : 1.011835
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003358 | Grad Max: 0.110652
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269989 | Grad Max: 1.307577
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.008733
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003176 | Grad Max: 0.003176
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001737 | Grad Max: 0.157471
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032943 | Grad Max: 0.899085
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.010563
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015887 | Grad Max: 0.064790
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000813
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003481 | Grad Max: 0.008576
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000315
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001113 | Grad Max: 0.002826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002053 | Grad Max: 0.003763
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033173 | Grad Max: 0.033173
[GRADIENT NORM TOTAL] 5.1228

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.311
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061836  0.49381638] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 319/1729 | B: 335/1713 | C: 241/1807
[LOSS Ex1] A: 0.67542 | B: 0.67236 | C: 0.67179
[LOGITS Ex2 A] Mean Abs: 1.412 | Max: 5.554
[LOSS Ex2] A: 0.24160 | B: 0.41614 | C: 0.34362
** [JOINT LOSS] ** : 1.006977
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.054095
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107348 | Grad Max: 0.487001
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.006897
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001095 | Grad Max: 0.001095
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000752 | Grad Max: 0.097512
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013722 | Grad Max: 0.551173
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000159 | Grad Max: 0.005537
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006342 | Grad Max: 0.025139
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000449
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001388 | Grad Max: 0.003964
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000160
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000442 | Grad Max: 0.001352
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000939 | Grad Max: 0.002515
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013870 | Grad Max: 0.013870
[GRADIENT NORM TOTAL] 2.0853

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.214
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5185363 0.4814637] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 318/1730 | B: 300/1556 | C: 273/1775
[LOSS Ex1] A: 0.67495 | B: 0.67493 | C: 0.66839
[LOGITS Ex2 A] Mean Abs: 1.439 | Max: 5.222
[LOSS Ex2] A: 0.26271 | B: 0.40449 | C: 0.33393
** [JOINT LOSS] ** : 1.006464
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007252 | Grad Max: 0.185921
  -> Layer: shared_layers.0.bias | Grad Mean: 0.351470 | Grad Max: 1.634982
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001975 | Grad Max: 0.008419
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007019 | Grad Max: 0.007019
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002426 | Grad Max: 0.216047
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045859 | Grad Max: 1.128587
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000530 | Grad Max: 0.014215
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021363 | Grad Max: 0.074842
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000927
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004735 | Grad Max: 0.010152
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000416
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001523 | Grad Max: 0.003701
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002849 | Grad Max: 0.005520
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045476 | Grad Max: 0.045476
[GRADIENT NORM TOTAL] 6.6687

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.343
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.574838   0.42516208] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 358/1690 | B: 305/1743 | C: 253/1795
[LOSS Ex1] A: 0.67291 | B: 0.67438 | C: 0.67045
[LOGITS Ex2 A] Mean Abs: 1.486 | Max: 5.337
[LOSS Ex2] A: 0.26064 | B: 0.43947 | C: 0.36787
** [JOINT LOSS] ** : 1.028577
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009681 | Grad Max: 0.239559
  -> Layer: shared_layers.0.bias | Grad Mean: 0.449521 | Grad Max: 2.128288
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.009690
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017375 | Grad Max: 0.017375
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003166 | Grad Max: 0.257063
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059208 | Grad Max: 1.353118
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000689 | Grad Max: 0.016861
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027717 | Grad Max: 0.099589
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001310
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006190 | Grad Max: 0.014195
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000512
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001977 | Grad Max: 0.004492
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003556 | Grad Max: 0.006029
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057252 | Grad Max: 0.057252
[GRADIENT NORM TOTAL] 8.5015

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.384
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50186664 0.49813333] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.023
[MASKS] A(Pass/Fail): 326/1722 | B: 318/1730 | C: 290/1758
[LOSS Ex1] A: 0.67580 | B: 0.67485 | C: 0.66564
[LOGITS Ex2 A] Mean Abs: 1.460 | Max: 5.316
[LOSS Ex2] A: 0.23878 | B: 0.41892 | C: 0.35139
** [JOINT LOSS] ** : 1.008461
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002857 | Grad Max: 0.091631
  -> Layer: shared_layers.0.bias | Grad Mean: 0.246648 | Grad Max: 1.220353
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.007600
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000732 | Grad Max: 0.000732
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001636 | Grad Max: 0.140782
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030939 | Grad Max: 0.803134
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.011060
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014642 | Grad Max: 0.061118
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000658
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003168 | Grad Max: 0.006723
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000308
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001011 | Grad Max: 0.002407
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001743 | Grad Max: 0.003981
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029710 | Grad Max: 0.029710
[GRADIENT NORM TOTAL] 4.8182

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.189
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5690585 0.4309415] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 324/1724 | B: 336/1712 | C: 250/1798
[LOSS Ex1] A: 0.67414 | B: 0.67225 | C: 0.67176
[LOGITS Ex2 A] Mean Abs: 1.422 | Max: 5.127
[LOSS Ex2] A: 0.24532 | B: 0.40110 | C: 0.35459
** [JOINT LOSS] ** : 1.006387
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003815 | Grad Max: 0.076548
  -> Layer: shared_layers.0.bias | Grad Mean: 0.204464 | Grad Max: 0.998095
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001960 | Grad Max: 0.008163
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005307 | Grad Max: 0.005307
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001476 | Grad Max: 0.099600
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027758 | Grad Max: 0.577335
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.007975
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013379 | Grad Max: 0.046206
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000664
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002969 | Grad Max: 0.006802
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000951 | Grad Max: 0.002486
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001763 | Grad Max: 0.003617
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027690 | Grad Max: 0.027690
[GRADIENT NORM TOTAL] 3.9128

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.251
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54218817 0.4578119 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 275/1341 | B: 301/1555 | C: 258/1790
[LOSS Ex1] A: 0.67316 | B: 0.67483 | C: 0.66976
[LOGITS Ex2 A] Mean Abs: 1.445 | Max: 5.372
[LOSS Ex2] A: 0.23473 | B: 0.39777 | C: 0.36634
** [JOINT LOSS] ** : 1.005534
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005472 | Grad Max: 0.131594
  -> Layer: shared_layers.0.bias | Grad Mean: 0.315473 | Grad Max: 1.632601
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.008041
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000566 | Grad Max: 0.000566
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.164620
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041600 | Grad Max: 0.918911
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000489 | Grad Max: 0.011343
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019775 | Grad Max: 0.068297
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000913
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004368 | Grad Max: 0.009695
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000357
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001394 | Grad Max: 0.003198
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002548 | Grad Max: 0.004919
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040646 | Grad Max: 0.040646
[GRADIENT NORM TOTAL] 6.1378

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.385
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070553 0.4929447] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 351/1697 | B: 307/1741 | C: 245/1803
[LOSS Ex1] A: 0.67340 | B: 0.67429 | C: 0.67192
[LOGITS Ex2 A] Mean Abs: 1.453 | Max: 5.684
[LOSS Ex2] A: 0.24686 | B: 0.42572 | C: 0.36404
** [JOINT LOSS] ** : 1.018739
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.051510
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116113 | Grad Max: 0.658866
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.008666
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011018 | Grad Max: 0.011018
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000853 | Grad Max: 0.122965
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015817 | Grad Max: 0.694469
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.006143
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007054 | Grad Max: 0.032651
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000379
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001554 | Grad Max: 0.003744
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000496 | Grad Max: 0.001517
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000926 | Grad Max: 0.002128
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014768 | Grad Max: 0.014768
[GRADIENT NORM TOTAL] 2.4642

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.359
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037591  0.49624097] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 366/1682 | B: 319/1729 | C: 181/1195
[LOSS Ex1] A: 0.67246 | B: 0.67475 | C: 0.66971
[LOGITS Ex2 A] Mean Abs: 1.476 | Max: 5.255
[LOSS Ex2] A: 0.25467 | B: 0.42795 | C: 0.33720
** [JOINT LOSS] ** : 1.012245
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006526 | Grad Max: 0.155827
  -> Layer: shared_layers.0.bias | Grad Mean: 0.317864 | Grad Max: 1.500377
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.009013
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010238 | Grad Max: 0.010238
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.210533
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041964 | Grad Max: 1.131443
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000478 | Grad Max: 0.011216
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019230 | Grad Max: 0.066830
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000858
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004280 | Grad Max: 0.009413
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000387
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001370 | Grad Max: 0.003415
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002370 | Grad Max: 0.005064
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039193 | Grad Max: 0.039193
[GRADIENT NORM TOTAL] 6.1274

[EPOCH SUMMARY] Train Loss: 1.0113

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9955 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 50/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.369
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5031915  0.49680853] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 344/1704 | B: 336/1712 | C: 274/1774
[LOSS Ex1] A: 0.67156 | B: 0.67214 | C: 0.66856
[LOGITS Ex2 A] Mean Abs: 1.496 | Max: 5.494
[LOSS Ex2] A: 0.26582 | B: 0.41595 | C: 0.35416
** [JOINT LOSS] ** : 1.016061
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008499 | Grad Max: 0.247885
  -> Layer: shared_layers.0.bias | Grad Mean: 0.414963 | Grad Max: 1.964271
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.009092
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002839 | Grad Max: 0.002839
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003017 | Grad Max: 0.256202
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056315 | Grad Max: 1.359728
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000641 | Grad Max: 0.015634
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025793 | Grad Max: 0.089253
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005764 | Grad Max: 0.012593
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000490
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001852 | Grad Max: 0.004381
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003374 | Grad Max: 0.006115
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.054384 | Grad Max: 0.054384
[GRADIENT NORM TOTAL] 8.0997

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.313
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061659 0.4938341] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 323/1725 | B: 302/1554 | C: 269/1779
[LOSS Ex1] A: 0.67523 | B: 0.67473 | C: 0.66994
[LOGITS Ex2 A] Mean Abs: 1.445 | Max: 5.276
[LOSS Ex2] A: 0.23793 | B: 0.39695 | C: 0.31754
** [JOINT LOSS] ** : 0.990775
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003053 | Grad Max: 0.081947
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201856 | Grad Max: 0.960309
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001910 | Grad Max: 0.007706
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005930 | Grad Max: 0.005930
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001388 | Grad Max: 0.132629
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026115 | Grad Max: 0.756341
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.007266
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012018 | Grad Max: 0.044325
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000530
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002643 | Grad Max: 0.005972
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000258
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000862 | Grad Max: 0.002168
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001501 | Grad Max: 0.004006
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025943 | Grad Max: 0.025943
[GRADIENT NORM TOTAL] 4.0163

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.216
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5187216  0.48127842] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 321/1727 | B: 307/1741 | C: 269/1779
[LOSS Ex1] A: 0.67475 | B: 0.67418 | C: 0.66947
[LOGITS Ex2 A] Mean Abs: 1.367 | Max: 5.640
[LOSS Ex2] A: 0.24942 | B: 0.42555 | C: 0.35640
** [JOINT LOSS] ** : 1.016592
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004380 | Grad Max: 0.096356
  -> Layer: shared_layers.0.bias | Grad Mean: 0.230850 | Grad Max: 1.059020
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001910 | Grad Max: 0.007797
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001464 | Grad Max: 0.001464
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001610 | Grad Max: 0.210220
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029965 | Grad Max: 1.186436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.009176
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013306 | Grad Max: 0.051251
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000624
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002956 | Grad Max: 0.006914
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000313
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000952 | Grad Max: 0.002719
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001836 | Grad Max: 0.003630
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028358 | Grad Max: 0.028358
[GRADIENT NORM TOTAL] 4.5623

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.346
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5757721  0.42422792] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.024
[MASKS] A(Pass/Fail): 361/1687 | B: 320/1728 | C: 257/1791
[LOSS Ex1] A: 0.67269 | B: 0.67465 | C: 0.67097
[LOGITS Ex2 A] Mean Abs: 1.422 | Max: 5.493
[LOSS Ex2] A: 0.23309 | B: 0.42223 | C: 0.35879
** [JOINT LOSS] ** : 1.010808
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006571 | Grad Max: 0.140868
  -> Layer: shared_layers.0.bias | Grad Mean: 0.345074 | Grad Max: 1.612147
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.008955
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014927 | Grad Max: 0.014927
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002362 | Grad Max: 0.243817
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044677 | Grad Max: 1.369806
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000510 | Grad Max: 0.012645
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020707 | Grad Max: 0.073093
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000895
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004603 | Grad Max: 0.009990
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000405
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001471 | Grad Max: 0.003597
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002679 | Grad Max: 0.004847
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043245 | Grad Max: 0.043245
[GRADIENT NORM TOTAL] 6.5637

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.387
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5018333  0.49816668] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 334/1714 | B: 337/1711 | C: 247/1801
[LOSS Ex1] A: 0.67561 | B: 0.67204 | C: 0.67011
[LOGITS Ex2 A] Mean Abs: 1.429 | Max: 5.414
[LOSS Ex2] A: 0.23696 | B: 0.40309 | C: 0.35957
** [JOINT LOSS] ** : 1.005790
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004796 | Grad Max: 0.122278
  -> Layer: shared_layers.0.bias | Grad Mean: 0.161628 | Grad Max: 0.738087
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.006920
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001955 | Grad Max: 0.001955
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001197 | Grad Max: 0.109482
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021861 | Grad Max: 0.544010
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000252 | Grad Max: 0.006445
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010061 | Grad Max: 0.036301
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000459
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002286 | Grad Max: 0.005206
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000230
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000733 | Grad Max: 0.001933
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001438 | Grad Max: 0.003292
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021587 | Grad Max: 0.021587
[GRADIENT NORM TOTAL] 3.0091

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.191
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56983334 0.4301666 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 330/1718 | B: 304/1552 | C: 243/1805
[LOSS Ex1] A: 0.67394 | B: 0.67463 | C: 0.67091
[LOGITS Ex2 A] Mean Abs: 1.471 | Max: 4.934
[LOSS Ex2] A: 0.25913 | B: 0.39655 | C: 0.36861
** [JOINT LOSS] ** : 1.014590
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004916 | Grad Max: 0.114569
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282052 | Grad Max: 1.417381
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001980 | Grad Max: 0.008726
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009197 | Grad Max: 0.009197
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.169571
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037542 | Grad Max: 0.947580
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000434 | Grad Max: 0.009851
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017664 | Grad Max: 0.061555
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000844
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003888 | Grad Max: 0.008719
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000338
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001243 | Grad Max: 0.003037
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002127 | Grad Max: 0.004141
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036037 | Grad Max: 0.036037
[GRADIENT NORM TOTAL] 5.5678

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.254
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5427067 0.4572933] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 281/1335 | B: 307/1741 | C: 282/1766
[LOSS Ex1] A: 0.67296 | B: 0.67408 | C: 0.66731
[LOGITS Ex2 A] Mean Abs: 1.530 | Max: 5.176
[LOSS Ex2] A: 0.24871 | B: 0.43854 | C: 0.33652
** [JOINT LOSS] ** : 1.012708
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006870 | Grad Max: 0.162918
  -> Layer: shared_layers.0.bias | Grad Mean: 0.359780 | Grad Max: 1.718003
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.008061
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002393 | Grad Max: 0.002393
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002474 | Grad Max: 0.215579
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046797 | Grad Max: 1.079177
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000520 | Grad Max: 0.014018
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021142 | Grad Max: 0.079253
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000959
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004680 | Grad Max: 0.010559
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000406
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001494 | Grad Max: 0.003650
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002558 | Grad Max: 0.004614
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042862 | Grad Max: 0.042862
[GRADIENT NORM TOTAL] 6.8881

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.388
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070343 0.4929657] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 355/1693 | B: 322/1726 | C: 272/1776
[LOSS Ex1] A: 0.67319 | B: 0.67455 | C: 0.66937
[LOGITS Ex2 A] Mean Abs: 1.490 | Max: 5.213
[LOSS Ex2] A: 0.24021 | B: 0.42016 | C: 0.35874
** [JOINT LOSS] ** : 1.012076
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004637 | Grad Max: 0.105715
  -> Layer: shared_layers.0.bias | Grad Mean: 0.227019 | Grad Max: 1.062659
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.008151
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005481 | Grad Max: 0.005481
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001592 | Grad Max: 0.127068
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030012 | Grad Max: 0.668789
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.009104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013491 | Grad Max: 0.049564
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000697
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003008 | Grad Max: 0.007798
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000282
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000971 | Grad Max: 0.002486
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001780 | Grad Max: 0.003940
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028997 | Grad Max: 0.028997
[GRADIENT NORM TOTAL] 4.3583

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.362
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038211  0.49617893] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 368/1680 | B: 337/1711 | C: 248/1800
[LOSS Ex1] A: 0.67225 | B: 0.67193 | C: 0.67146
[LOGITS Ex2 A] Mean Abs: 1.437 | Max: 6.041
[LOSS Ex2] A: 0.23847 | B: 0.40604 | C: 0.33444
** [JOINT LOSS] ** : 0.998199
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003294 | Grad Max: 0.067005
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198651 | Grad Max: 0.936816
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.009321
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014423 | Grad Max: 0.014423
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001351 | Grad Max: 0.155874
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025233 | Grad Max: 0.882567
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000282 | Grad Max: 0.006890
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011478 | Grad Max: 0.038858
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000572
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002547 | Grad Max: 0.005877
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000260
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000805 | Grad Max: 0.002125
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001447 | Grad Max: 0.003335
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022929 | Grad Max: 0.022929
[GRADIENT NORM TOTAL] 3.8441

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.372
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50317067 0.4968293 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 348/1700 | B: 304/1552 | C: 260/1788
[LOSS Ex1] A: 0.67134 | B: 0.67454 | C: 0.66928
[LOGITS Ex2 A] Mean Abs: 1.425 | Max: 6.182
[LOSS Ex2] A: 0.25569 | B: 0.40224 | C: 0.32377
** [JOINT LOSS] ** : 0.998955
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004139 | Grad Max: 0.119781
  -> Layer: shared_layers.0.bias | Grad Mean: 0.343926 | Grad Max: 1.588207
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.009617
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011643 | Grad Max: 0.011643
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.248781
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042160 | Grad Max: 1.395509
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000474 | Grad Max: 0.014206
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019403 | Grad Max: 0.084519
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.001011
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004254 | Grad Max: 0.010517
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000357
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001338 | Grad Max: 0.003252
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002223 | Grad Max: 0.004204
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036953 | Grad Max: 0.036953
[GRADIENT NORM TOTAL] 6.7585

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.315
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061435  0.49385652] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.024
[MASKS] A(Pass/Fail): 327/1721 | B: 307/1741 | C: 259/1789
[LOSS Ex1] A: 0.67505 | B: 0.67398 | C: 0.67001
[LOGITS Ex2 A] Mean Abs: 1.409 | Max: 5.944
[LOSS Ex2] A: 0.23175 | B: 0.43513 | C: 0.34484
** [JOINT LOSS] ** : 1.010254
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002467 | Grad Max: 0.062172
  -> Layer: shared_layers.0.bias | Grad Mean: 0.180236 | Grad Max: 0.829815
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.007788
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008998 | Grad Max: 0.008998
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001200 | Grad Max: 0.163586
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022437 | Grad Max: 0.929825
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.007071
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010023 | Grad Max: 0.042144
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000504
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002198 | Grad Max: 0.005241
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000691 | Grad Max: 0.001857
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001211 | Grad Max: 0.002655
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019130 | Grad Max: 0.019130
[GRADIENT NORM TOTAL] 3.6128

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.217
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51891583 0.4810841 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.023
[MASKS] A(Pass/Fail): 327/1721 | B: 325/1723 | C: 240/1808
[LOSS Ex1] A: 0.67458 | B: 0.67445 | C: 0.67134
[LOGITS Ex2 A] Mean Abs: 1.428 | Max: 5.975
[LOSS Ex2] A: 0.25383 | B: 0.42583 | C: 0.37255
** [JOINT LOSS] ** : 1.024194
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004384 | Grad Max: 0.116961
  -> Layer: shared_layers.0.bias | Grad Mean: 0.183233 | Grad Max: 0.814503
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001847 | Grad Max: 0.007299
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002385 | Grad Max: 0.002385
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001278 | Grad Max: 0.101625
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023693 | Grad Max: 0.515517
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000274 | Grad Max: 0.006649
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011095 | Grad Max: 0.039879
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000604
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002481 | Grad Max: 0.006214
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000242
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000780 | Grad Max: 0.002060
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001254 | Grad Max: 0.002730
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021212 | Grad Max: 0.021212
[GRADIENT NORM TOTAL] 3.4462

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.348
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57665694 0.4233431 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.024
[MASKS] A(Pass/Fail): 365/1683 | B: 337/1711 | C: 249/1799
[LOSS Ex1] A: 0.67249 | B: 0.67183 | C: 0.66977
[LOGITS Ex2 A] Mean Abs: 1.473 | Max: 5.409
[LOSS Ex2] A: 0.23475 | B: 0.41296 | C: 0.35357
** [JOINT LOSS] ** : 1.005122
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005777 | Grad Max: 0.150724
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289286 | Grad Max: 1.326064
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.008883
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008923 | Grad Max: 0.008923
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002012 | Grad Max: 0.201273
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037787 | Grad Max: 1.021285
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.009132
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017238 | Grad Max: 0.057017
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000811
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003809 | Grad Max: 0.009134
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001222 | Grad Max: 0.002982
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002148 | Grad Max: 0.004764
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035822 | Grad Max: 0.035822
[GRADIENT NORM TOTAL] 5.5407

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.390
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5018011  0.49819887] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 337/1711 | B: 305/1551 | C: 172/1204
[LOSS Ex1] A: 0.67543 | B: 0.67444 | C: 0.66885
[LOGITS Ex2 A] Mean Abs: 1.455 | Max: 5.055
[LOSS Ex2] A: 0.22164 | B: 0.40126 | C: 0.34001
** [JOINT LOSS] ** : 0.993878
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.049307
  -> Layer: shared_layers.0.bias | Grad Mean: 0.064425 | Grad Max: 0.292392
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.007677
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006317 | Grad Max: 0.006317
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000508 | Grad Max: 0.065038
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008576 | Grad Max: 0.360260
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.004464
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002888 | Grad Max: 0.022902
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000577 | Grad Max: 0.002176
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000114
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000189 | Grad Max: 0.000800
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000603 | Grad Max: 0.001706
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005836 | Grad Max: 0.005836
[GRADIENT NORM TOTAL] 1.3869

[EPOCH SUMMARY] Train Loss: 1.0079

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9889 | Alpha: 0.5500
No improve count: 3/15

############################## EPOCH 51/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.193
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5706051  0.42939496] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 334/1714 | B: 307/1741 | C: 293/1755
[LOSS Ex1] A: 0.67374 | B: 0.67387 | C: 0.66797
[LOGITS Ex2 A] Mean Abs: 1.404 | Max: 5.588
[LOSS Ex2] A: 0.24576 | B: 0.43767 | C: 0.33676
** [JOINT LOSS] ** : 1.011920
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005406 | Grad Max: 0.113882
  -> Layer: shared_layers.0.bias | Grad Mean: 0.349164 | Grad Max: 1.593858
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001997 | Grad Max: 0.008270
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004772 | Grad Max: 0.004772
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002382 | Grad Max: 0.256129
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045010 | Grad Max: 1.447117
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000504 | Grad Max: 0.012334
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020625 | Grad Max: 0.073955
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000926
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004592 | Grad Max: 0.010659
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000379
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001461 | Grad Max: 0.003545
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002473 | Grad Max: 0.004379
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040549 | Grad Max: 0.040549
[GRADIENT NORM TOTAL] 6.7722

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.256
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54322463 0.45677543] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.024
[MASKS] A(Pass/Fail): 285/1331 | B: 327/1721 | C: 245/1803
[LOSS Ex1] A: 0.67276 | B: 0.67434 | C: 0.67094
[LOGITS Ex2 A] Mean Abs: 1.457 | Max: 5.700
[LOSS Ex2] A: 0.24072 | B: 0.43893 | C: 0.33051
** [JOINT LOSS] ** : 1.009400
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007222 | Grad Max: 0.170503
  -> Layer: shared_layers.0.bias | Grad Mean: 0.418495 | Grad Max: 1.896594
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.008402
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004747 | Grad Max: 0.004747
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002884 | Grad Max: 0.293898
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054185 | Grad Max: 1.663638
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000624 | Grad Max: 0.015625
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025368 | Grad Max: 0.090619
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001058
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005634 | Grad Max: 0.011815
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000481
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001808 | Grad Max: 0.004284
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003256 | Grad Max: 0.005914
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053134 | Grad Max: 0.053134
[GRADIENT NORM TOTAL] 8.0885

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.391
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50701725 0.4929827 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.025
[MASKS] A(Pass/Fail): 359/1689 | B: 337/1711 | C: 244/1804
[LOSS Ex1] A: 0.67299 | B: 0.67171 | C: 0.66885
[LOGITS Ex2 A] Mean Abs: 1.459 | Max: 5.254
[LOSS Ex2] A: 0.24253 | B: 0.40792 | C: 0.36284
** [JOINT LOSS] ** : 1.008945
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003007 | Grad Max: 0.076758
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182944 | Grad Max: 0.777063
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.008532
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005743 | Grad Max: 0.005743
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001289 | Grad Max: 0.209590
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024018 | Grad Max: 1.184287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.006856
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011197 | Grad Max: 0.042113
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000536
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002485 | Grad Max: 0.005520
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000795 | Grad Max: 0.002069
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001494 | Grad Max: 0.003022
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023738 | Grad Max: 0.023738
[GRADIENT NORM TOTAL] 3.7603

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.365
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038717  0.49612832] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 372/1676 | B: 305/1551 | C: 259/1789
[LOSS Ex1] A: 0.67203 | B: 0.67432 | C: 0.66894
[LOGITS Ex2 A] Mean Abs: 1.495 | Max: 5.345
[LOSS Ex2] A: 0.24386 | B: 0.39652 | C: 0.36311
** [JOINT LOSS] ** : 1.006262
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005893 | Grad Max: 0.143330
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282829 | Grad Max: 1.372583
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.009033
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010997 | Grad Max: 0.010997
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.196181
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037764 | Grad Max: 1.024901
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000418 | Grad Max: 0.011324
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016991 | Grad Max: 0.062707
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000842
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003770 | Grad Max: 0.008636
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000393
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001206 | Grad Max: 0.003129
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002136 | Grad Max: 0.004650
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035279 | Grad Max: 0.035279
[GRADIENT NORM TOTAL] 5.4894

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.375
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5031238  0.49687618] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 351/1697 | B: 308/1740 | C: 255/1793
[LOSS Ex1] A: 0.67113 | B: 0.67376 | C: 0.67172
[LOGITS Ex2 A] Mean Abs: 1.484 | Max: 6.915
[LOSS Ex2] A: 0.27138 | B: 0.43714 | C: 0.35069
** [JOINT LOSS] ** : 1.025271
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008802 | Grad Max: 0.245253
  -> Layer: shared_layers.0.bias | Grad Mean: 0.426945 | Grad Max: 1.987905
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.009595
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014022 | Grad Max: 0.014022
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003072 | Grad Max: 0.240826
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057419 | Grad Max: 1.316973
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000641 | Grad Max: 0.015104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025990 | Grad Max: 0.090110
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001168
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005808 | Grad Max: 0.013120
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000495
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001854 | Grad Max: 0.004561
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003276 | Grad Max: 0.006090
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.054083 | Grad Max: 0.054083
[GRADIENT NORM TOTAL] 8.2265

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.318
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50609255 0.49390745] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.521 | Std: 0.024
[MASKS] A(Pass/Fail): 329/1719 | B: 327/1721 | C: 240/1808
[LOSS Ex1] A: 0.67487 | B: 0.67423 | C: 0.67163
[LOGITS Ex2 A] Mean Abs: 1.464 | Max: 5.399
[LOSS Ex2] A: 0.24963 | B: 0.42029 | C: 0.35817
** [JOINT LOSS] ** : 1.016273
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005459 | Grad Max: 0.123672
  -> Layer: shared_layers.0.bias | Grad Mean: 0.329206 | Grad Max: 1.507023
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001875 | Grad Max: 0.007266
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006443 | Grad Max: 0.006443
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.189990
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041676 | Grad Max: 1.034025
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.012822
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019195 | Grad Max: 0.077233
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000943
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004265 | Grad Max: 0.010196
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000365
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001359 | Grad Max: 0.003242
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002181 | Grad Max: 0.004605
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037738 | Grad Max: 0.037738
[GRADIENT NORM TOTAL] 6.3299

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.219
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51906127 0.48093876] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.521 | Std: 0.024
[MASKS] A(Pass/Fail): 333/1715 | B: 338/1710 | C: 255/1793
[LOSS Ex1] A: 0.67440 | B: 0.67159 | C: 0.67069
[LOGITS Ex2 A] Mean Abs: 1.391 | Max: 5.495
[LOSS Ex2] A: 0.23904 | B: 0.40527 | C: 0.34280
** [JOINT LOSS] ** : 1.001261
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002908 | Grad Max: 0.062837
  -> Layer: shared_layers.0.bias | Grad Mean: 0.129995 | Grad Max: 0.601151
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001928 | Grad Max: 0.007641
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001021 | Grad Max: 0.001021
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000897 | Grad Max: 0.063186
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016896 | Grad Max: 0.347359
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000205 | Grad Max: 0.005721
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008270 | Grad Max: 0.031166
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000442
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001822 | Grad Max: 0.004416
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000194
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000566 | Grad Max: 0.001676
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000968 | Grad Max: 0.002576
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015068 | Grad Max: 0.015068
[GRADIENT NORM TOTAL] 2.4812

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.351
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5775888 0.4224112] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 369/1679 | B: 307/1549 | C: 273/1775
[LOSS Ex1] A: 0.67228 | B: 0.67421 | C: 0.66785
[LOGITS Ex2 A] Mean Abs: 1.447 | Max: 6.206
[LOSS Ex2] A: 0.21780 | B: 0.40282 | C: 0.33645
** [JOINT LOSS] ** : 0.990476
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002927 | Grad Max: 0.068813
  -> Layer: shared_layers.0.bias | Grad Mean: 0.233869 | Grad Max: 1.039296
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.008729
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007197 | Grad Max: 0.007197
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001507 | Grad Max: 0.113691
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028524 | Grad Max: 0.651593
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.008516
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013674 | Grad Max: 0.054754
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000711
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003007 | Grad Max: 0.007338
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000285
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000936 | Grad Max: 0.002607
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001543 | Grad Max: 0.003600
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025474 | Grad Max: 0.025474
[GRADIENT NORM TOTAL] 4.4319

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.393
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017548  0.49824515] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 343/1705 | B: 308/1740 | C: 255/1793
[LOSS Ex1] A: 0.67525 | B: 0.67365 | C: 0.66883
[LOGITS Ex2 A] Mean Abs: 1.453 | Max: 5.435
[LOSS Ex2] A: 0.24170 | B: 0.42138 | C: 0.34632
** [JOINT LOSS] ** : 1.009042
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001950 | Grad Max: 0.048691
  -> Layer: shared_layers.0.bias | Grad Mean: 0.028358 | Grad Max: 0.201440
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001935 | Grad Max: 0.007708
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005922 | Grad Max: 0.005922
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000301 | Grad Max: 0.039378
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004421 | Grad Max: 0.209209
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002123
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001208 | Grad Max: 0.007498
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000204
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000259 | Grad Max: 0.001736
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000451
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000361 | Grad Max: 0.001192
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002419 | Grad Max: 0.002419
[GRADIENT NORM TOTAL] 0.6784

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.195
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57136565 0.42863432] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 342/1706 | B: 329/1719 | C: 250/1798
[LOSS Ex1] A: 0.67354 | B: 0.67411 | C: 0.66940
[LOGITS Ex2 A] Mean Abs: 1.481 | Max: 5.206
[LOSS Ex2] A: 0.25827 | B: 0.42602 | C: 0.33902
** [JOINT LOSS] ** : 1.013459
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004544 | Grad Max: 0.120813
  -> Layer: shared_layers.0.bias | Grad Mean: 0.233195 | Grad Max: 1.037590
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001923 | Grad Max: 0.007819
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001018 | Grad Max: 0.001018
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001603 | Grad Max: 0.140401
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029895 | Grad Max: 0.765885
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000331 | Grad Max: 0.009842
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013483 | Grad Max: 0.055523
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000646
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002986 | Grad Max: 0.007161
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000271
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000954 | Grad Max: 0.002293
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001663 | Grad Max: 0.003583
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027286 | Grad Max: 0.027286
[GRADIENT NORM TOTAL] 4.5088

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.259
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54371023 0.4562898 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 288/1328 | B: 341/1707 | C: 263/1785
[LOSS Ex1] A: 0.67255 | B: 0.67146 | C: 0.66952
[LOGITS Ex2 A] Mean Abs: 1.520 | Max: 6.442
[LOSS Ex2] A: 0.23263 | B: 0.40570 | C: 0.34564
** [JOINT LOSS] ** : 0.999165
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004897 | Grad Max: 0.110447
  -> Layer: shared_layers.0.bias | Grad Mean: 0.221893 | Grad Max: 1.042382
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.008159
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002253 | Grad Max: 0.002253
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001569 | Grad Max: 0.121090
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029158 | Grad Max: 0.646262
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000326 | Grad Max: 0.008435
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013258 | Grad Max: 0.044259
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000696
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002959 | Grad Max: 0.007226
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000279
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000937 | Grad Max: 0.002434
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001479 | Grad Max: 0.003434
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026422 | Grad Max: 0.026422
[GRADIENT NORM TOTAL] 4.2203

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.395
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50698644 0.49301353] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 361/1687 | B: 308/1548 | C: 262/1786
[LOSS Ex1] A: 0.67277 | B: 0.67409 | C: 0.66831
[LOGITS Ex2 A] Mean Abs: 1.458 | Max: 5.416
[LOSS Ex2] A: 0.24037 | B: 0.39634 | C: 0.36041
** [JOINT LOSS] ** : 1.004096
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001731 | Grad Max: 0.044744
  -> Layer: shared_layers.0.bias | Grad Mean: 0.088693 | Grad Max: 0.447580
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.009185
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012273 | Grad Max: 0.012273
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000689 | Grad Max: 0.059233
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012271 | Grad Max: 0.328681
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.004253
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005018 | Grad Max: 0.024474
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001089 | Grad Max: 0.003222
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000147
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000341 | Grad Max: 0.001102
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000563 | Grad Max: 0.001917
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009625 | Grad Max: 0.009625
[GRADIENT NORM TOTAL] 1.8493

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.368
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50390667 0.4960933 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 380/1668 | B: 308/1740 | C: 266/1782
[LOSS Ex1] A: 0.67178 | B: 0.67351 | C: 0.66861
[LOGITS Ex2 A] Mean Abs: 1.451 | Max: 5.265
[LOSS Ex2] A: 0.24000 | B: 0.43203 | C: 0.34466
** [JOINT LOSS] ** : 1.010198
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002331 | Grad Max: 0.059156
  -> Layer: shared_layers.0.bias | Grad Mean: 0.155309 | Grad Max: 0.772081
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.009109
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011073 | Grad Max: 0.011073
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001108 | Grad Max: 0.131922
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020317 | Grad Max: 0.720941
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000227 | Grad Max: 0.005944
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009321 | Grad Max: 0.033575
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000420
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002060 | Grad Max: 0.004946
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000205
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000655 | Grad Max: 0.001760
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001209 | Grad Max: 0.002438
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018914 | Grad Max: 0.018914
[GRADIENT NORM TOTAL] 3.1740

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.378
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50308394 0.49691606] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.025
[MASKS] A(Pass/Fail): 355/1693 | B: 333/1715 | C: 182/1194
[LOSS Ex1] A: 0.67087 | B: 0.67398 | C: 0.66881
[LOGITS Ex2 A] Mean Abs: 1.473 | Max: 5.976
[LOSS Ex2] A: 0.24768 | B: 0.41773 | C: 0.34392
** [JOINT LOSS] ** : 1.007663
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002763 | Grad Max: 0.088366
  -> Layer: shared_layers.0.bias | Grad Mean: 0.086101 | Grad Max: 0.312898
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.007928
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000863 | Grad Max: 0.000863
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000661 | Grad Max: 0.052777
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011397 | Grad Max: 0.299911
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.004105
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004158 | Grad Max: 0.020472
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000338
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000968 | Grad Max: 0.003148
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000127
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000300 | Grad Max: 0.000954
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000429 | Grad Max: 0.001935
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007635 | Grad Max: 0.007635
[GRADIENT NORM TOTAL] 1.6538

[EPOCH SUMMARY] Train Loss: 1.0081

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9761 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9849 -> New: 0.9761)

############################## EPOCH 52/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.320
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50605214 0.49394783] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 338/1710 | B: 344/1704 | C: 256/1792
[LOSS Ex1] A: 0.67464 | B: 0.67132 | C: 0.66985
[LOGITS Ex2 A] Mean Abs: 1.444 | Max: 5.738
[LOSS Ex2] A: 0.24374 | B: 0.39594 | C: 0.34865
** [JOINT LOSS] ** : 1.001379
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.053680
  -> Layer: shared_layers.0.bias | Grad Mean: 0.048953 | Grad Max: 0.209079
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001914 | Grad Max: 0.007299
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002002 | Grad Max: 0.002002
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000425 | Grad Max: 0.048402
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006931 | Grad Max: 0.277079
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003604
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001514 | Grad Max: 0.016461
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000173
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000262 | Grad Max: 0.001974
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000077
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000081 | Grad Max: 0.000455
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001265
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000725 | Grad Max: 0.000725
[GRADIENT NORM TOTAL] 1.0838

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.221
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51926935 0.48073068] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 341/1707 | B: 310/1546 | C: 278/1770
[LOSS Ex1] A: 0.67414 | B: 0.67394 | C: 0.66780
[LOGITS Ex2 A] Mean Abs: 1.416 | Max: 5.740
[LOSS Ex2] A: 0.23852 | B: 0.38958 | C: 0.34240
** [JOINT LOSS] ** : 0.995461
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002903 | Grad Max: 0.078113
  -> Layer: shared_layers.0.bias | Grad Mean: 0.129526 | Grad Max: 0.535633
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.008751
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006909 | Grad Max: 0.006909
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000918 | Grad Max: 0.153267
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016560 | Grad Max: 0.863587
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.004509
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006695 | Grad Max: 0.026377
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000431
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001519 | Grad Max: 0.004123
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000470 | Grad Max: 0.001433
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000755 | Grad Max: 0.002363
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012301 | Grad Max: 0.012301
[GRADIENT NORM TOTAL] 2.6861

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.355
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5788781 0.4211219] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 371/1677 | B: 310/1738 | C: 252/1796
[LOSS Ex1] A: 0.67198 | B: 0.67335 | C: 0.66757
[LOGITS Ex2 A] Mean Abs: 1.474 | Max: 5.149
[LOSS Ex2] A: 0.22332 | B: 0.42433 | C: 0.33215
** [JOINT LOSS] ** : 0.997569
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001852 | Grad Max: 0.042809
  -> Layer: shared_layers.0.bias | Grad Mean: 0.052470 | Grad Max: 0.205765
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.008788
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009730 | Grad Max: 0.009730
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000409 | Grad Max: 0.136052
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006459 | Grad Max: 0.774613
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002440
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001159 | Grad Max: 0.013494
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000231 | Grad Max: 0.001310
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000081 | Grad Max: 0.000524
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000493 | Grad Max: 0.001187
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000195 | Grad Max: 0.000195
[GRADIENT NORM TOTAL] 1.5768

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.398
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017345 0.4982655] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.025
[MASKS] A(Pass/Fail): 350/1698 | B: 335/1713 | C: 272/1776
[LOSS Ex1] A: 0.67497 | B: 0.67381 | C: 0.66852
[LOGITS Ex2 A] Mean Abs: 1.486 | Max: 5.325
[LOSS Ex2] A: 0.23687 | B: 0.41176 | C: 0.31996
** [JOINT LOSS] ** : 0.995297
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002615 | Grad Max: 0.058875
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139443 | Grad Max: 0.684730
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001929 | Grad Max: 0.007885
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007187 | Grad Max: 0.007187
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001008 | Grad Max: 0.090277
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018438 | Grad Max: 0.498375
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000210 | Grad Max: 0.007104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008548 | Grad Max: 0.040166
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000393
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001819 | Grad Max: 0.004317
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000201
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000584 | Grad Max: 0.001497
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000941 | Grad Max: 0.002865
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017094 | Grad Max: 0.017094
[GRADIENT NORM TOTAL] 2.7949

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.198
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5725275  0.42747247] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.025
[MASKS] A(Pass/Fail): 351/1697 | B: 347/1701 | C: 285/1763
[LOSS Ex1] A: 0.67323 | B: 0.67113 | C: 0.66819
[LOGITS Ex2 A] Mean Abs: 1.476 | Max: 5.314
[LOSS Ex2] A: 0.24148 | B: 0.39948 | C: 0.33713
** [JOINT LOSS] ** : 0.996880
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.032996
  -> Layer: shared_layers.0.bias | Grad Mean: 0.044438 | Grad Max: 0.211552
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.008527
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010330 | Grad Max: 0.010330
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000427 | Grad Max: 0.055054
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007386 | Grad Max: 0.301503
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003047
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002802 | Grad Max: 0.015893
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000251
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000600 | Grad Max: 0.002076
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000200 | Grad Max: 0.000632
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000470 | Grad Max: 0.001890
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006724 | Grad Max: 0.006724
[GRADIENT NORM TOTAL] 1.0432

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.264
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5445401  0.45545983] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 295/1321 | B: 310/1546 | C: 277/1771
[LOSS Ex1] A: 0.67220 | B: 0.67376 | C: 0.66725
[LOGITS Ex2 A] Mean Abs: 1.477 | Max: 6.139
[LOSS Ex2] A: 0.23804 | B: 0.40799 | C: 0.34584
** [JOINT LOSS] ** : 1.001694
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006550 | Grad Max: 0.168087
  -> Layer: shared_layers.0.bias | Grad Mean: 0.436391 | Grad Max: 2.054043
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.007778
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001986 | Grad Max: 0.001986
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002837 | Grad Max: 0.254246
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053996 | Grad Max: 1.388732
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000630 | Grad Max: 0.014795
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025994 | Grad Max: 0.090741
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001136
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005679 | Grad Max: 0.012365
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000487
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001787 | Grad Max: 0.004292
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003090 | Grad Max: 0.005582
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050916 | Grad Max: 0.050916
[GRADIENT NORM TOTAL] 8.1977

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.400
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069848  0.49301517] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 375/1673 | B: 311/1737 | C: 231/1817
[LOSS Ex1] A: 0.67242 | B: 0.67316 | C: 0.67270
[LOGITS Ex2 A] Mean Abs: 1.461 | Max: 6.300
[LOSS Ex2] A: 0.24286 | B: 0.45576 | C: 0.36073
** [JOINT LOSS] ** : 1.025877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005581 | Grad Max: 0.161819
  -> Layer: shared_layers.0.bias | Grad Mean: 0.442567 | Grad Max: 2.098195
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.008939
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016120 | Grad Max: 0.016120
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002884 | Grad Max: 0.221121
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054799 | Grad Max: 1.219781
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000624 | Grad Max: 0.016208
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025893 | Grad Max: 0.095369
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005664 | Grad Max: 0.012210
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000462
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001787 | Grad Max: 0.004038
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003050 | Grad Max: 0.006117
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051169 | Grad Max: 0.051169
[GRADIENT NORM TOTAL] 8.4464

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.372
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039592  0.49604082] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 391/1657 | B: 336/1712 | C: 244/1804
[LOSS Ex1] A: 0.67141 | B: 0.67362 | C: 0.67088
[LOGITS Ex2 A] Mean Abs: 1.474 | Max: 5.549
[LOSS Ex2] A: 0.22616 | B: 0.42304 | C: 0.34715
** [JOINT LOSS] ** : 1.004089
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003123 | Grad Max: 0.078578
  -> Layer: shared_layers.0.bias | Grad Mean: 0.221815 | Grad Max: 1.082166
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002140 | Grad Max: 0.009187
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014580 | Grad Max: 0.014580
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001473 | Grad Max: 0.104000
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027388 | Grad Max: 0.560247
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.008338
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012976 | Grad Max: 0.051796
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000604
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002778 | Grad Max: 0.006338
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000261
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000869 | Grad Max: 0.002266
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001567 | Grad Max: 0.003318
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025034 | Grad Max: 0.025034
[GRADIENT NORM TOTAL] 4.2369

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.383
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030145  0.49698552] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 365/1683 | B: 350/1698 | C: 260/1788
[LOSS Ex1] A: 0.67049 | B: 0.67093 | C: 0.66801
[LOGITS Ex2 A] Mean Abs: 1.512 | Max: 5.809
[LOSS Ex2] A: 0.26347 | B: 0.40903 | C: 0.33341
** [JOINT LOSS] ** : 1.005115
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007399 | Grad Max: 0.265852
  -> Layer: shared_layers.0.bias | Grad Mean: 0.299978 | Grad Max: 1.291453
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.008358
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000635 | Grad Max: 0.000635
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.208734
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041220 | Grad Max: 1.102593
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.009907
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018052 | Grad Max: 0.059380
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000840
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004069 | Grad Max: 0.009078
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000361
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001288 | Grad Max: 0.003220
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002295 | Grad Max: 0.004685
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037248 | Grad Max: 0.037248
[GRADIENT NORM TOTAL] 5.8836

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.324
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059813  0.49401867] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.025
[MASKS] A(Pass/Fail): 351/1697 | B: 312/1544 | C: 277/1771
[LOSS Ex1] A: 0.67432 | B: 0.67358 | C: 0.66767
[LOGITS Ex2 A] Mean Abs: 1.512 | Max: 5.429
[LOSS Ex2] A: 0.24243 | B: 0.40176 | C: 0.35352
** [JOINT LOSS] ** : 1.004431
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009348 | Grad Max: 0.227232
  -> Layer: shared_layers.0.bias | Grad Mean: 0.477093 | Grad Max: 2.134872
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.007773
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005854 | Grad Max: 0.005854
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003293 | Grad Max: 0.302185
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061790 | Grad Max: 1.648027
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000695 | Grad Max: 0.016417
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028486 | Grad Max: 0.102069
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001210
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006307 | Grad Max: 0.013910
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000486
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001983 | Grad Max: 0.004648
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003460 | Grad Max: 0.006290
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057070 | Grad Max: 0.057070
[GRADIENT NORM TOTAL] 9.1035

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.223
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5196335 0.4803666] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.024
[MASKS] A(Pass/Fail): 351/1697 | B: 316/1732 | C: 243/1805
[LOSS Ex1] A: 0.67383 | B: 0.67298 | C: 0.67018
[LOGITS Ex2 A] Mean Abs: 1.473 | Max: 5.773
[LOSS Ex2] A: 0.24960 | B: 0.42749 | C: 0.34925
** [JOINT LOSS] ** : 1.014440
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006188 | Grad Max: 0.185117
  -> Layer: shared_layers.0.bias | Grad Mean: 0.303763 | Grad Max: 1.350207
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.007034
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003431 | Grad Max: 0.003431
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.196612
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039135 | Grad Max: 1.044422
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.010277
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018279 | Grad Max: 0.062921
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000854
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004061 | Grad Max: 0.009267
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000342
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.003066
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002126 | Grad Max: 0.004297
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035783 | Grad Max: 0.035783
[GRADIENT NORM TOTAL] 5.8035

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.360
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58064103 0.41935894] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 383/1665 | B: 336/1712 | C: 244/1804
[LOSS Ex1] A: 0.67162 | B: 0.67346 | C: 0.67010
[LOGITS Ex2 A] Mean Abs: 1.474 | Max: 5.650
[LOSS Ex2] A: 0.22150 | B: 0.41459 | C: 0.34953
** [JOINT LOSS] ** : 1.000268
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.048808
  -> Layer: shared_layers.0.bias | Grad Mean: 0.102569 | Grad Max: 0.512850
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.008469
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009757 | Grad Max: 0.009757
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000783 | Grad Max: 0.071013
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014574 | Grad Max: 0.399306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.004992
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007020 | Grad Max: 0.030098
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000381
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001533 | Grad Max: 0.003765
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000472 | Grad Max: 0.001434
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000916 | Grad Max: 0.002550
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014027 | Grad Max: 0.014027
[GRADIENT NORM TOTAL] 2.0570

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.402
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.501602   0.49839798] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 357/1691 | B: 351/1697 | C: 272/1776
[LOSS Ex1] A: 0.67467 | B: 0.67076 | C: 0.66714
[LOGITS Ex2 A] Mean Abs: 1.485 | Max: 5.419
[LOSS Ex2] A: 0.22025 | B: 0.41202 | C: 0.33983
** [JOINT LOSS] ** : 0.994884
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005426 | Grad Max: 0.149818
  -> Layer: shared_layers.0.bias | Grad Mean: 0.193602 | Grad Max: 1.052187
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002002 | Grad Max: 0.007388
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001038 | Grad Max: 0.001038
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001507 | Grad Max: 0.160670
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027671 | Grad Max: 0.897604
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000315 | Grad Max: 0.007443
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012888 | Grad Max: 0.043275
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000662
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002892 | Grad Max: 0.007354
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000261
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000900 | Grad Max: 0.002252
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001665 | Grad Max: 0.003569
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025479 | Grad Max: 0.025479
[GRADIENT NORM TOTAL] 3.9206

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.202
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57389367 0.42610636] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 366/1682 | B: 314/1542 | C: 173/1203
[LOSS Ex1] A: 0.67291 | B: 0.67343 | C: 0.66851
[LOGITS Ex2 A] Mean Abs: 1.502 | Max: 5.814
[LOSS Ex2] A: 0.23955 | B: 0.38610 | C: 0.34194
** [JOINT LOSS] ** : 0.994144
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001828 | Grad Max: 0.031693
  -> Layer: shared_layers.0.bias | Grad Mean: 0.070200 | Grad Max: 0.250505
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001956 | Grad Max: 0.007925
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002019 | Grad Max: 0.002019
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000515 | Grad Max: 0.111171
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009071 | Grad Max: 0.600321
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003877
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003047 | Grad Max: 0.018669
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000248
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000666 | Grad Max: 0.002369
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000105
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000214 | Grad Max: 0.000790
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000475 | Grad Max: 0.001627
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005841 | Grad Max: 0.005841
[GRADIENT NORM TOTAL] 1.6256

[EPOCH SUMMARY] Train Loss: 1.0023

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9734 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9761 -> New: 0.9734)

############################## EPOCH 53/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.268
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5454524  0.45454758] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 305/1311 | B: 320/1728 | C: 247/1801
[LOSS Ex1] A: 0.67189 | B: 0.67282 | C: 0.67102
[LOGITS Ex2 A] Mean Abs: 1.533 | Max: 6.024
[LOSS Ex2] A: 0.23107 | B: 0.42188 | C: 0.34147
** [JOINT LOSS] ** : 1.003383
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001698 | Grad Max: 0.038736
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108362 | Grad Max: 0.486169
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.007978
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000791 | Grad Max: 0.000791
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000728 | Grad Max: 0.080964
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013374 | Grad Max: 0.459094
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.005074
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006072 | Grad Max: 0.026319
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000335
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001299 | Grad Max: 0.003304
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000148
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000391 | Grad Max: 0.001176
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000478 | Grad Max: 0.001890
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009524 | Grad Max: 0.009524
[GRADIENT NORM TOTAL] 2.1127

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.404
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50690526 0.49309477] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 392/1656 | B: 340/1708 | C: 245/1803
[LOSS Ex1] A: 0.67211 | B: 0.67330 | C: 0.67070
[LOGITS Ex2 A] Mean Abs: 1.515 | Max: 7.604
[LOSS Ex2] A: 0.23778 | B: 0.41270 | C: 0.35868
** [JOINT LOSS] ** : 1.008424
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004628 | Grad Max: 0.135551
  -> Layer: shared_layers.0.bias | Grad Mean: 0.165700 | Grad Max: 0.815410
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.008080
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005929 | Grad Max: 0.005929
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001202 | Grad Max: 0.142191
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022426 | Grad Max: 0.780649
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000247 | Grad Max: 0.006050
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010189 | Grad Max: 0.035803
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000556
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002272 | Grad Max: 0.005989
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000222
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000723 | Grad Max: 0.001952
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001510 | Grad Max: 0.003162
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022649 | Grad Max: 0.022649
[GRADIENT NORM TOTAL] 3.2172

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.376
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50412 0.49588] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 402/1646 | B: 352/1696 | C: 266/1782
[LOSS Ex1] A: 0.67108 | B: 0.67058 | C: 0.66618
[LOGITS Ex2 A] Mean Abs: 1.518 | Max: 6.054
[LOSS Ex2] A: 0.22322 | B: 0.40400 | C: 0.34783
** [JOINT LOSS] ** : 0.994294
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002352 | Grad Max: 0.052931
  -> Layer: shared_layers.0.bias | Grad Mean: 0.064573 | Grad Max: 0.265717
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.009013
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007601 | Grad Max: 0.007601
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000475 | Grad Max: 0.063473
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008293 | Grad Max: 0.318792
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.003982
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003478 | Grad Max: 0.016676
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000242
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000800 | Grad Max: 0.002522
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000108
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000246 | Grad Max: 0.000865
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000516 | Grad Max: 0.001596
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006831 | Grad Max: 0.006831
[GRADIENT NORM TOTAL] 1.2373

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.387
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029237 0.4970763] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 376/1672 | B: 315/1541 | C: 256/1792
[LOSS Ex1] A: 0.67015 | B: 0.67325 | C: 0.66850
[LOGITS Ex2 A] Mean Abs: 1.540 | Max: 6.561
[LOSS Ex2] A: 0.24847 | B: 0.39814 | C: 0.34301
** [JOINT LOSS] ** : 1.000507
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006319 | Grad Max: 0.201275
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282365 | Grad Max: 1.316593
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002046 | Grad Max: 0.009073
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007816 | Grad Max: 0.007816
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.183352
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038773 | Grad Max: 0.964984
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000443 | Grad Max: 0.011170
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018239 | Grad Max: 0.065521
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000783
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004064 | Grad Max: 0.009383
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000334
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001281 | Grad Max: 0.003054
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002271 | Grad Max: 0.004702
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037008 | Grad Max: 0.037008
[GRADIENT NORM TOTAL] 5.4662

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.328
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50592303 0.49407697] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.522 | Std: 0.025
[MASKS] A(Pass/Fail): 364/1684 | B: 326/1722 | C: 266/1782
[LOSS Ex1] A: 0.67404 | B: 0.67264 | C: 0.66808
[LOGITS Ex2 A] Mean Abs: 1.524 | Max: 4.838
[LOSS Ex2] A: 0.23728 | B: 0.43039 | C: 0.34034
** [JOINT LOSS] ** : 1.007587
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006645 | Grad Max: 0.177450
  -> Layer: shared_layers.0.bias | Grad Mean: 0.286637 | Grad Max: 1.421772
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001899 | Grad Max: 0.007155
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000616 | Grad Max: 0.000616
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.139695
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038474 | Grad Max: 0.786546
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000435 | Grad Max: 0.010357
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017928 | Grad Max: 0.064260
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000766
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003991 | Grad Max: 0.008894
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000356
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001240 | Grad Max: 0.003073
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002034 | Grad Max: 0.004078
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034346 | Grad Max: 0.034346
[GRADIENT NORM TOTAL] 5.4399

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.225
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5199569  0.48004308] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.522 | Std: 0.025
[MASKS] A(Pass/Fail): 360/1688 | B: 343/1705 | C: 283/1765
[LOSS Ex1] A: 0.67355 | B: 0.67312 | C: 0.66699
[LOGITS Ex2 A] Mean Abs: 1.460 | Max: 5.216
[LOSS Ex2] A: 0.23797 | B: 0.41488 | C: 0.32467
** [JOINT LOSS] ** : 0.997056
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002701 | Grad Max: 0.068090
  -> Layer: shared_layers.0.bias | Grad Mean: 0.072796 | Grad Max: 0.312151
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001919 | Grad Max: 0.007590
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000737 | Grad Max: 0.000737
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000564 | Grad Max: 0.164828
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009186 | Grad Max: 0.932008
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003221
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001223 | Grad Max: 0.013449
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000171
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000186 | Grad Max: 0.001469
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000060 | Grad Max: 0.000339
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000432 | Grad Max: 0.001066
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000135 | Grad Max: 0.000135
[GRADIENT NORM TOTAL] 2.0217

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.364
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58222324 0.4177768 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 402/1646 | B: 355/1693 | C: 269/1779
[LOSS Ex1] A: 0.67129 | B: 0.67039 | C: 0.66725
[LOGITS Ex2 A] Mean Abs: 1.499 | Max: 5.233
[LOSS Ex2] A: 0.21575 | B: 0.41152 | C: 0.34408
** [JOINT LOSS] ** : 0.993426
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003874 | Grad Max: 0.100644
  -> Layer: shared_layers.0.bias | Grad Mean: 0.327660 | Grad Max: 1.403558
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.008781
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010068 | Grad Max: 0.010068
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.304582
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039318 | Grad Max: 1.726431
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000428 | Grad Max: 0.011324
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017998 | Grad Max: 0.069744
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000772
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003910 | Grad Max: 0.008931
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000362
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001200 | Grad Max: 0.003089
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001959 | Grad Max: 0.003684
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032609 | Grad Max: 0.032609
[GRADIENT NORM TOTAL] 6.6143

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.408
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50154495 0.49845502] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 369/1679 | B: 318/1538 | C: 280/1768
[LOSS Ex1] A: 0.67438 | B: 0.67308 | C: 0.66663
[LOGITS Ex2 A] Mean Abs: 1.504 | Max: 5.237
[LOSS Ex2] A: 0.21793 | B: 0.40506 | C: 0.32470
** [JOINT LOSS] ** : 0.987258
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004840 | Grad Max: 0.144386
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268718 | Grad Max: 1.151254
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.007598
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005852 | Grad Max: 0.005852
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001767 | Grad Max: 0.236241
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032420 | Grad Max: 1.343292
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000355 | Grad Max: 0.008804
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014704 | Grad Max: 0.053384
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000672
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003263 | Grad Max: 0.007210
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000289
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000998 | Grad Max: 0.002548
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001685 | Grad Max: 0.003520
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027347 | Grad Max: 0.027347
[GRADIENT NORM TOTAL] 5.1423

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.205
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5750934  0.42490664] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 374/1674 | B: 328/1720 | C: 256/1792
[LOSS Ex1] A: 0.67260 | B: 0.67246 | C: 0.66919
[LOGITS Ex2 A] Mean Abs: 1.527 | Max: 5.620
[LOSS Ex2] A: 0.24223 | B: 0.41963 | C: 0.34816
** [JOINT LOSS] ** : 1.008090
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001618 | Grad Max: 0.035603
  -> Layer: shared_layers.0.bias | Grad Mean: 0.056163 | Grad Max: 0.359755
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001966 | Grad Max: 0.008038
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005740 | Grad Max: 0.005740
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000515 | Grad Max: 0.112222
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009064 | Grad Max: 0.618828
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.003067
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003032 | Grad Max: 0.016047
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000279
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000619 | Grad Max: 0.002427
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000111
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000190 | Grad Max: 0.000728
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001359
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005215 | Grad Max: 0.005215
[GRADIENT NORM TOTAL] 1.5684

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.272
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5462899  0.45371008] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 316/1300 | B: 344/1704 | C: 272/1776
[LOSS Ex1] A: 0.67156 | B: 0.67294 | C: 0.66686
[LOGITS Ex2 A] Mean Abs: 1.559 | Max: 5.165
[LOSS Ex2] A: 0.21532 | B: 0.41460 | C: 0.33156
** [JOINT LOSS] ** : 0.990946
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002608 | Grad Max: 0.057163
  -> Layer: shared_layers.0.bias | Grad Mean: 0.146260 | Grad Max: 0.733907
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.007853
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001548 | Grad Max: 0.001548
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000995 | Grad Max: 0.091345
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017893 | Grad Max: 0.500112
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006001
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007613 | Grad Max: 0.035374
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000387
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001634 | Grad Max: 0.004400
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000156
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000504 | Grad Max: 0.001369
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000687 | Grad Max: 0.002466
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013273 | Grad Max: 0.013273
[GRADIENT NORM TOTAL] 2.9090

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.410
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50690055 0.49309945] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 405/1643 | B: 357/1691 | C: 250/1798
[LOSS Ex1] A: 0.67179 | B: 0.67020 | C: 0.66951
[LOGITS Ex2 A] Mean Abs: 1.540 | Max: 6.984
[LOSS Ex2] A: 0.22023 | B: 0.40272 | C: 0.33757
** [JOINT LOSS] ** : 0.990673
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001998 | Grad Max: 0.079296
  -> Layer: shared_layers.0.bias | Grad Mean: 0.031018 | Grad Max: 0.132181
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.008872
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012678 | Grad Max: 0.012678
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000353 | Grad Max: 0.043142
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005686 | Grad Max: 0.240059
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.003210
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001829 | Grad Max: 0.012064
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000309
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000403 | Grad Max: 0.002004
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000095
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000119 | Grad Max: 0.000650
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001399
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003038 | Grad Max: 0.003038
[GRADIENT NORM TOTAL] 0.8616

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.381
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.504224   0.49577603] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 410/1638 | B: 320/1536 | C: 269/1779
[LOSS Ex1] A: 0.67073 | B: 0.67290 | C: 0.66718
[LOGITS Ex2 A] Mean Abs: 1.524 | Max: 5.154
[LOSS Ex2] A: 0.21077 | B: 0.38703 | C: 0.33202
** [JOINT LOSS] ** : 0.980212
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001428 | Grad Max: 0.040353
  -> Layer: shared_layers.0.bias | Grad Mean: 0.022715 | Grad Max: 0.145399
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.008988
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012033 | Grad Max: 0.012033
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000316 | Grad Max: 0.044878
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004894 | Grad Max: 0.239344
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.003072
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001149 | Grad Max: 0.010954
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000172
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000191 | Grad Max: 0.001283
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000067 | Grad Max: 0.000457
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000561 | Grad Max: 0.001418
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001355 | Grad Max: 0.001355
[GRADIENT NORM TOTAL] 0.7812

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.391
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50284535 0.49715468] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 386/1662 | B: 331/1717 | C: 240/1808
[LOSS Ex1] A: 0.66979 | B: 0.67226 | C: 0.67087
[LOGITS Ex2 A] Mean Abs: 1.544 | Max: 5.890
[LOSS Ex2] A: 0.24598 | B: 0.42041 | C: 0.32464
** [JOINT LOSS] ** : 1.001320
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003274 | Grad Max: 0.124731
  -> Layer: shared_layers.0.bias | Grad Mean: 0.056294 | Grad Max: 0.212471
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.009214
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013324 | Grad Max: 0.013324
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000622 | Grad Max: 0.085610
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010090 | Grad Max: 0.472167
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.003576
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003355 | Grad Max: 0.020304
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000270
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000813 | Grad Max: 0.002604
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000130
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000260 | Grad Max: 0.000926
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000416 | Grad Max: 0.001494
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007364 | Grad Max: 0.007364
[GRADIENT NORM TOTAL] 1.5133

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.331
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50582266 0.49417734] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 373/1675 | B: 346/1702 | C: 186/1190
[LOSS Ex1] A: 0.67373 | B: 0.67273 | C: 0.66643
[LOGITS Ex2 A] Mean Abs: 1.505 | Max: 5.847
[LOSS Ex2] A: 0.22284 | B: 0.42331 | C: 0.34374
** [JOINT LOSS] ** : 1.000926
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003407 | Grad Max: 0.092639
  -> Layer: shared_layers.0.bias | Grad Mean: 0.190223 | Grad Max: 0.864598
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001888 | Grad Max: 0.007099
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000827 | Grad Max: 0.000827
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001324 | Grad Max: 0.162383
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025033 | Grad Max: 0.924810
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000292 | Grad Max: 0.009206
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012005 | Grad Max: 0.050690
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000661
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002605 | Grad Max: 0.006412
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000264
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000802 | Grad Max: 0.002173
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001442 | Grad Max: 0.002849
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023006 | Grad Max: 0.023006
[GRADIENT NORM TOTAL] 3.6809

[EPOCH SUMMARY] Train Loss: 0.9974

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9696 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9734 -> New: 0.9696)

############################## EPOCH 54/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.228
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52036434 0.47963566] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.025
[MASKS] A(Pass/Fail): 379/1669 | B: 357/1691 | C: 265/1783
[LOSS Ex1] A: 0.67323 | B: 0.66997 | C: 0.66715
[LOGITS Ex2 A] Mean Abs: 1.479 | Max: 5.363
[LOSS Ex2] A: 0.23278 | B: 0.40564 | C: 0.34775
** [JOINT LOSS] ** : 0.998837
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003609 | Grad Max: 0.123441
  -> Layer: shared_layers.0.bias | Grad Mean: 0.050923 | Grad Max: 0.250954
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.008137
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001746 | Grad Max: 0.001746
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000535 | Grad Max: 0.059790
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008156 | Grad Max: 0.332946
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003297
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001622 | Grad Max: 0.014949
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000244
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000317 | Grad Max: 0.001747
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000101 | Grad Max: 0.000502
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000331 | Grad Max: 0.001179
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002952 | Grad Max: 0.002952
[GRADIENT NORM TOTAL] 1.2583

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.369
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58420956 0.41579038] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.027
[MASKS] A(Pass/Fail): 417/1631 | B: 323/1533 | C: 268/1780
[LOSS Ex1] A: 0.67090 | B: 0.67267 | C: 0.66767
[LOGITS Ex2 A] Mean Abs: 1.536 | Max: 6.534
[LOSS Ex2] A: 0.21932 | B: 0.39828 | C: 0.33608
** [JOINT LOSS] ** : 0.988308
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004169 | Grad Max: 0.091656
  -> Layer: shared_layers.0.bias | Grad Mean: 0.211753 | Grad Max: 0.934285
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.008742
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009575 | Grad Max: 0.009575
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001399 | Grad Max: 0.160608
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026198 | Grad Max: 0.855773
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000277 | Grad Max: 0.008389
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011517 | Grad Max: 0.048313
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000571
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002506 | Grad Max: 0.006086
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000221
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000776 | Grad Max: 0.001994
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001336 | Grad Max: 0.003350
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022630 | Grad Max: 0.022630
[GRADIENT NORM TOTAL] 4.1326

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.413
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013467  0.49865335] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 401/1647 | B: 331/1717 | C: 244/1804
[LOSS Ex1] A: 0.67404 | B: 0.67203 | C: 0.66870
[LOGITS Ex2 A] Mean Abs: 1.547 | Max: 5.341
[LOSS Ex2] A: 0.21557 | B: 0.42174 | C: 0.32804
** [JOINT LOSS] ** : 0.993372
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003079 | Grad Max: 0.076270
  -> Layer: shared_layers.0.bias | Grad Mean: 0.148041 | Grad Max: 0.608153
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.007243
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002354 | Grad Max: 0.002354
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000921 | Grad Max: 0.117147
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017357 | Grad Max: 0.624951
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.004779
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007520 | Grad Max: 0.028570
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000470
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001611 | Grad Max: 0.004208
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000171
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000489 | Grad Max: 0.001423
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000755 | Grad Max: 0.002199
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013482 | Grad Max: 0.013482
[GRADIENT NORM TOTAL] 2.7231

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.210
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57677394 0.42322603] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.026
[MASKS] A(Pass/Fail): 426/1622 | B: 347/1701 | C: 287/1761
[LOSS Ex1] A: 0.67222 | B: 0.67251 | C: 0.66531
[LOGITS Ex2 A] Mean Abs: 1.531 | Max: 5.671
[LOSS Ex2] A: 0.23988 | B: 0.41736 | C: 0.31774
** [JOINT LOSS] ** : 0.995007
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002621 | Grad Max: 0.087867
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208167 | Grad Max: 1.075573
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.008169
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005003 | Grad Max: 0.005003
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001447 | Grad Max: 0.239179
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026887 | Grad Max: 1.338233
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000296 | Grad Max: 0.008550
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012558 | Grad Max: 0.055307
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000529
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002707 | Grad Max: 0.006377
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000823 | Grad Max: 0.002027
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001323 | Grad Max: 0.002517
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022548 | Grad Max: 0.022548
[GRADIENT NORM TOTAL] 4.4152

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.278
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54743934 0.45256066] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 352/1264 | B: 359/1689 | C: 254/1794
[LOSS Ex1] A: 0.67118 | B: 0.66974 | C: 0.66737
[LOGITS Ex2 A] Mean Abs: 1.552 | Max: 5.568
[LOSS Ex2] A: 0.21672 | B: 0.40943 | C: 0.34192
** [JOINT LOSS] ** : 0.992123
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003322 | Grad Max: 0.109615
  -> Layer: shared_layers.0.bias | Grad Mean: 0.284310 | Grad Max: 1.347645
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.008006
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002747 | Grad Max: 0.002747
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.262886
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035858 | Grad Max: 1.474252
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000403 | Grad Max: 0.010913
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016833 | Grad Max: 0.068274
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000731
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003632 | Grad Max: 0.008028
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000316
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001113 | Grad Max: 0.002847
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001914 | Grad Max: 0.003624
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031472 | Grad Max: 0.031472
[GRADIENT NORM TOTAL] 5.8695

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.415
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067163 0.4932837] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 461/1587 | B: 324/1532 | C: 280/1768
[LOSS Ex1] A: 0.67139 | B: 0.67247 | C: 0.66468
[LOGITS Ex2 A] Mean Abs: 1.570 | Max: 6.589
[LOSS Ex2] A: 0.22507 | B: 0.38593 | C: 0.32489
** [JOINT LOSS] ** : 0.981473
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003968 | Grad Max: 0.147866
  -> Layer: shared_layers.0.bias | Grad Mean: 0.061995 | Grad Max: 0.299099
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.008078
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003549 | Grad Max: 0.003549
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000582 | Grad Max: 0.156323
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008884 | Grad Max: 0.888538
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.003390
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001417 | Grad Max: 0.016398
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000235
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000249 | Grad Max: 0.001242
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000431
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000428 | Grad Max: 0.001364
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002745 | Grad Max: 0.002745
[GRADIENT NORM TOTAL] 1.7231

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.385
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044357  0.49556428] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 467/1581 | B: 338/1710 | C: 272/1776
[LOSS Ex1] A: 0.67030 | B: 0.67182 | C: 0.66856
[LOGITS Ex2 A] Mean Abs: 1.585 | Max: 5.457
[LOSS Ex2] A: 0.22455 | B: 0.42065 | C: 0.37523
** [JOINT LOSS] ** : 1.010373
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004899 | Grad Max: 0.116117
  -> Layer: shared_layers.0.bias | Grad Mean: 0.244646 | Grad Max: 1.117230
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.008658
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008555 | Grad Max: 0.008555
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001660 | Grad Max: 0.140354
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031576 | Grad Max: 0.719286
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.010582
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014748 | Grad Max: 0.056234
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000613
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003198 | Grad Max: 0.007219
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000286
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000955 | Grad Max: 0.002438
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001533 | Grad Max: 0.002936
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025043 | Grad Max: 0.025043
[GRADIENT NORM TOTAL] 4.6263

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.396
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026327  0.49736732] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 462/1586 | B: 348/1700 | C: 264/1784
[LOSS Ex1] A: 0.66937 | B: 0.67231 | C: 0.66894
[LOGITS Ex2 A] Mean Abs: 1.572 | Max: 6.376
[LOSS Ex2] A: 0.23283 | B: 0.41003 | C: 0.33421
** [JOINT LOSS] ** : 0.995896
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002196 | Grad Max: 0.054845
  -> Layer: shared_layers.0.bias | Grad Mean: 0.106950 | Grad Max: 0.489642
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.008504
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006525 | Grad Max: 0.006525
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000826 | Grad Max: 0.081543
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014932 | Grad Max: 0.461901
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.006287
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006173 | Grad Max: 0.031304
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000294
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001341 | Grad Max: 0.003352
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000137
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000408 | Grad Max: 0.001104
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.001619
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010428 | Grad Max: 0.010428
[GRADIENT NORM TOTAL] 2.2486

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.335
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056467  0.49435326] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 438/1610 | B: 363/1685 | C: 258/1790
[LOSS Ex1] A: 0.67339 | B: 0.66954 | C: 0.66785
[LOGITS Ex2 A] Mean Abs: 1.509 | Max: 5.487
[LOSS Ex2] A: 0.22348 | B: 0.39584 | C: 0.33528
** [JOINT LOSS] ** : 0.988463
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007327 | Grad Max: 0.194904
  -> Layer: shared_layers.0.bias | Grad Mean: 0.299635 | Grad Max: 1.234077
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001939 | Grad Max: 0.007038
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001990 | Grad Max: 0.001990
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.288539
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039674 | Grad Max: 1.517904
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000437 | Grad Max: 0.009613
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018101 | Grad Max: 0.064140
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000827
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003980 | Grad Max: 0.009072
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000316
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001216 | Grad Max: 0.002831
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002213 | Grad Max: 0.004123
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034282 | Grad Max: 0.034282
[GRADIENT NORM TOTAL] 5.7060

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.230
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5207709  0.47922912] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 452/1596 | B: 327/1529 | C: 258/1790
[LOSS Ex1] A: 0.67289 | B: 0.67228 | C: 0.66995
[LOGITS Ex2 A] Mean Abs: 1.482 | Max: 6.280
[LOSS Ex2] A: 0.24207 | B: 0.39248 | C: 0.33389
** [JOINT LOSS] ** : 0.994521
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008260 | Grad Max: 0.237691
  -> Layer: shared_layers.0.bias | Grad Mean: 0.330052 | Grad Max: 1.347302
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.008361
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012424 | Grad Max: 0.012424
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.298153
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042445 | Grad Max: 1.560057
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.010486
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019220 | Grad Max: 0.062898
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000813
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004253 | Grad Max: 0.009445
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000361
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001302 | Grad Max: 0.003124
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002278 | Grad Max: 0.004237
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036398 | Grad Max: 0.036398
[GRADIENT NORM TOTAL] 6.1678

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.374
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5861117  0.41388825] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.027
[MASKS] A(Pass/Fail): 502/1546 | B: 343/1705 | C: 264/1784
[LOSS Ex1] A: 0.67051 | B: 0.67164 | C: 0.66672
[LOGITS Ex2 A] Mean Abs: 1.560 | Max: 7.000
[LOSS Ex2] A: 0.21823 | B: 0.42421 | C: 0.33566
** [JOINT LOSS] ** : 0.995659
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004011 | Grad Max: 0.138884
  -> Layer: shared_layers.0.bias | Grad Mean: 0.070322 | Grad Max: 0.378126
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.008424
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007607 | Grad Max: 0.007607
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000649 | Grad Max: 0.150712
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010752 | Grad Max: 0.819787
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.004068
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003978 | Grad Max: 0.021398
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000266
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000949 | Grad Max: 0.003166
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000119
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000291 | Grad Max: 0.000996
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000549 | Grad Max: 0.001687
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007701 | Grad Max: 0.007701
[GRADIENT NORM TOTAL] 1.7495

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.418
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012262 0.4987738] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 457/1591 | B: 350/1698 | C: 256/1792
[LOSS Ex1] A: 0.67372 | B: 0.67213 | C: 0.66824
[LOGITS Ex2 A] Mean Abs: 1.603 | Max: 5.391
[LOSS Ex2] A: 0.22418 | B: 0.43522 | C: 0.32433
** [JOINT LOSS] ** : 0.999274
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004983 | Grad Max: 0.167502
  -> Layer: shared_layers.0.bias | Grad Mean: 0.486258 | Grad Max: 2.232906
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.007614
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008170 | Grad Max: 0.008170
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003102 | Grad Max: 0.282982
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058480 | Grad Max: 1.593680
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000657 | Grad Max: 0.017556
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027811 | Grad Max: 0.109822
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001088
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005884 | Grad Max: 0.012938
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000460
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001772 | Grad Max: 0.004230
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002849 | Grad Max: 0.004987
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048043 | Grad Max: 0.048043
[GRADIENT NORM TOTAL] 9.4578

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.213
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57814455 0.42185542] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 465/1583 | B: 366/1682 | C: 267/1781
[LOSS Ex1] A: 0.67188 | B: 0.66935 | C: 0.66613
[LOGITS Ex2 A] Mean Abs: 1.611 | Max: 5.577
[LOSS Ex2] A: 0.24083 | B: 0.43190 | C: 0.33835
** [JOINT LOSS] ** : 1.006149
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009386 | Grad Max: 0.213203
  -> Layer: shared_layers.0.bias | Grad Mean: 0.625462 | Grad Max: 2.809701
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.007992
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003301 | Grad Max: 0.003301
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004160 | Grad Max: 0.361935
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.078758 | Grad Max: 1.954845
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000871 | Grad Max: 0.025466
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036788 | Grad Max: 0.157678
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000124 | Grad Max: 0.001567
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007974 | Grad Max: 0.017612
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000578
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002417 | Grad Max: 0.005635
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004059 | Grad Max: 0.006842
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.067203 | Grad Max: 0.067203
[GRADIENT NORM TOTAL] 12.1140

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.283
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54839236 0.4516076 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.027
[MASKS] A(Pass/Fail): 378/1238 | B: 329/1527 | C: 162/1214
[LOSS Ex1] A: 0.67084 | B: 0.67211 | C: 0.66997
[LOGITS Ex2 A] Mean Abs: 1.633 | Max: 5.443
[LOSS Ex2] A: 0.22069 | B: 0.40273 | C: 0.36722
** [JOINT LOSS] ** : 1.001192
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007105 | Grad Max: 0.147659
  -> Layer: shared_layers.0.bias | Grad Mean: 0.403199 | Grad Max: 1.796261
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001959 | Grad Max: 0.008594
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006393 | Grad Max: 0.006393
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002760 | Grad Max: 0.242311
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052360 | Grad Max: 1.336626
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000582 | Grad Max: 0.014476
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024406 | Grad Max: 0.093882
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.001071
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005273 | Grad Max: 0.012435
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000420
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001591 | Grad Max: 0.003893
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002696 | Grad Max: 0.004817
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043769 | Grad Max: 0.043769
[GRADIENT NORM TOTAL] 7.8455

[EPOCH SUMMARY] Train Loss: 0.9958

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9662 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9696 -> New: 0.9662)

############################## EPOCH 55/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.420
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50663364 0.49336636] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 498/1550 | B: 344/1704 | C: 268/1780
[LOSS Ex1] A: 0.67106 | B: 0.67147 | C: 0.66787
[LOGITS Ex2 A] Mean Abs: 1.571 | Max: 6.800
[LOSS Ex2] A: 0.22213 | B: 0.42043 | C: 0.34683
** [JOINT LOSS] ** : 0.999931
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002518 | Grad Max: 0.059306
  -> Layer: shared_layers.0.bias | Grad Mean: 0.112263 | Grad Max: 0.562740
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002012 | Grad Max: 0.008163
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007483 | Grad Max: 0.007483
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000775 | Grad Max: 0.084069
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013476 | Grad Max: 0.467730
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.005138
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004817 | Grad Max: 0.024322
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000272
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001011 | Grad Max: 0.003090
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000117
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000304 | Grad Max: 0.000925
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000536 | Grad Max: 0.001406
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008298 | Grad Max: 0.008298
[GRADIENT NORM TOTAL] 2.2202

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.389
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045645  0.49543545] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 486/1562 | B: 351/1697 | C: 237/1811
[LOSS Ex1] A: 0.66996 | B: 0.67198 | C: 0.67001
[LOGITS Ex2 A] Mean Abs: 1.538 | Max: 6.185
[LOSS Ex2] A: 0.21066 | B: 0.42462 | C: 0.35055
** [JOINT LOSS] ** : 0.999262
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003029 | Grad Max: 0.108599
  -> Layer: shared_layers.0.bias | Grad Mean: 0.285188 | Grad Max: 1.373014
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.009042
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014478 | Grad Max: 0.014478
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.193938
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036302 | Grad Max: 1.099221
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000406 | Grad Max: 0.011683
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017166 | Grad Max: 0.072385
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000698
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003675 | Grad Max: 0.008226
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000310
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001120 | Grad Max: 0.002991
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001924 | Grad Max: 0.004048
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031861 | Grad Max: 0.031861
[GRADIENT NORM TOTAL] 5.8286

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.400
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025035  0.49749646] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 479/1569 | B: 367/1681 | C: 261/1787
[LOSS Ex1] A: 0.66904 | B: 0.66920 | C: 0.66656
[LOGITS Ex2 A] Mean Abs: 1.539 | Max: 7.277
[LOSS Ex2] A: 0.23645 | B: 0.40485 | C: 0.33676
** [JOINT LOSS] ** : 0.994290
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003203 | Grad Max: 0.095218
  -> Layer: shared_layers.0.bias | Grad Mean: 0.168527 | Grad Max: 0.843352
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.008568
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005741 | Grad Max: 0.005741
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001235 | Grad Max: 0.141236
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021645 | Grad Max: 0.794843
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000221 | Grad Max: 0.007680
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009162 | Grad Max: 0.047818
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000467
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001899 | Grad Max: 0.004842
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000163
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000572 | Grad Max: 0.001514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000933 | Grad Max: 0.001934
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015608 | Grad Max: 0.015608
[GRADIENT NORM TOTAL] 3.4140

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.338
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50551486 0.49448508] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.523 | Std: 0.027
[MASKS] A(Pass/Fail): 449/1599 | B: 332/1524 | C: 267/1781
[LOSS Ex1] A: 0.67314 | B: 0.67197 | C: 0.66740
[LOGITS Ex2 A] Mean Abs: 1.564 | Max: 5.390
[LOSS Ex2] A: 0.21381 | B: 0.39428 | C: 0.31886
** [JOINT LOSS] ** : 0.979821
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004613 | Grad Max: 0.122113
  -> Layer: shared_layers.0.bias | Grad Mean: 0.184761 | Grad Max: 0.772406
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.007273
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003147 | Grad Max: 0.003147
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001287 | Grad Max: 0.134309
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023895 | Grad Max: 0.686279
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.008418
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010264 | Grad Max: 0.042376
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000458
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002283 | Grad Max: 0.005245
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000253
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000708 | Grad Max: 0.001925
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001209 | Grad Max: 0.003126
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020535 | Grad Max: 0.020535
[GRADIENT NORM TOTAL] 3.5221

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.232
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5210937  0.47890627] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.523 | Std: 0.026
[MASKS] A(Pass/Fail): 463/1585 | B: 343/1705 | C: 265/1783
[LOSS Ex1] A: 0.67265 | B: 0.67132 | C: 0.66734
[LOGITS Ex2 A] Mean Abs: 1.521 | Max: 5.919
[LOSS Ex2] A: 0.22782 | B: 0.42709 | C: 0.31628
** [JOINT LOSS] ** : 0.994169
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004782 | Grad Max: 0.121377
  -> Layer: shared_layers.0.bias | Grad Mean: 0.260292 | Grad Max: 1.156800
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001894 | Grad Max: 0.007540
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001344 | Grad Max: 0.001344
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001734 | Grad Max: 0.145440
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032446 | Grad Max: 0.771354
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.009389
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014400 | Grad Max: 0.053599
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000675
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003149 | Grad Max: 0.007292
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000278
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000959 | Grad Max: 0.002423
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001513 | Grad Max: 0.003531
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026486 | Grad Max: 0.026486
[GRADIENT NORM TOTAL] 4.9462

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.378
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58758956 0.41241047] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 515/1533 | B: 353/1695 | C: 269/1779
[LOSS Ex1] A: 0.67023 | B: 0.67183 | C: 0.66664
[LOGITS Ex2 A] Mean Abs: 1.546 | Max: 5.848
[LOSS Ex2] A: 0.21451 | B: 0.41618 | C: 0.32045
** [JOINT LOSS] ** : 0.986614
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001368 | Grad Max: 0.031483
  -> Layer: shared_layers.0.bias | Grad Mean: 0.031191 | Grad Max: 0.188174
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.008676
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012512 | Grad Max: 0.012512
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000324 | Grad Max: 0.055501
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005297 | Grad Max: 0.282298
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002350
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001188 | Grad Max: 0.009825
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000193
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000209 | Grad Max: 0.001739
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000068 | Grad Max: 0.000470
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000388 | Grad Max: 0.000991
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000654 | Grad Max: 0.000654
[GRADIENT NORM TOTAL] 0.9112

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.422
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50107557 0.4989245 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 475/1573 | B: 367/1681 | C: 231/1817
[LOSS Ex1] A: 0.67348 | B: 0.66904 | C: 0.67021
[LOGITS Ex2 A] Mean Abs: 1.529 | Max: 5.244
[LOSS Ex2] A: 0.22482 | B: 0.40908 | C: 0.32146
** [JOINT LOSS] ** : 0.989363
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005805 | Grad Max: 0.171927
  -> Layer: shared_layers.0.bias | Grad Mean: 0.240105 | Grad Max: 1.058137
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001927 | Grad Max: 0.007388
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006077 | Grad Max: 0.006077
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001706 | Grad Max: 0.164417
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031428 | Grad Max: 0.904450
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.007577
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013804 | Grad Max: 0.049004
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000607
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003062 | Grad Max: 0.006961
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000292
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000925 | Grad Max: 0.002500
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001526 | Grad Max: 0.003124
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024888 | Grad Max: 0.024888
[GRADIENT NORM TOTAL] 4.6365

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.217
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57930404 0.42069596] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 483/1565 | B: 333/1523 | C: 288/1760
[LOSS Ex1] A: 0.67162 | B: 0.67181 | C: 0.66447
[LOGITS Ex2 A] Mean Abs: 1.535 | Max: 5.410
[LOSS Ex2] A: 0.23014 | B: 0.39575 | C: 0.33706
** [JOINT LOSS] ** : 0.990286
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004534 | Grad Max: 0.128877
  -> Layer: shared_layers.0.bias | Grad Mean: 0.180708 | Grad Max: 0.714372
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.007882
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001098 | Grad Max: 0.001098
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001330 | Grad Max: 0.142775
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024705 | Grad Max: 0.800091
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.007226
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010386 | Grad Max: 0.044139
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000464
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002311 | Grad Max: 0.005423
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000197
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000701 | Grad Max: 0.001924
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001148 | Grad Max: 0.002391
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018771 | Grad Max: 0.018771
[GRADIENT NORM TOTAL] 3.6239

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.287
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54917824 0.45082176] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 389/1227 | B: 343/1705 | C: 295/1753
[LOSS Ex1] A: 0.67058 | B: 0.67115 | C: 0.66444
[LOGITS Ex2 A] Mean Abs: 1.611 | Max: 5.441
[LOSS Ex2] A: 0.20764 | B: 0.41479 | C: 0.36391
** [JOINT LOSS] ** : 0.997510
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002915 | Grad Max: 0.075545
  -> Layer: shared_layers.0.bias | Grad Mean: 0.220110 | Grad Max: 1.018529
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.007556
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006316 | Grad Max: 0.006316
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001411 | Grad Max: 0.125342
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026231 | Grad Max: 0.703906
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.008869
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011984 | Grad Max: 0.051104
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000517
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002553 | Grad Max: 0.005931
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000755 | Grad Max: 0.001896
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000983 | Grad Max: 0.002245
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018893 | Grad Max: 0.018893
[GRADIENT NORM TOTAL] 4.1665

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.423
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064943 0.4935057] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 506/1542 | B: 353/1695 | C: 254/1794
[LOSS Ex1] A: 0.67079 | B: 0.67167 | C: 0.66775
[LOGITS Ex2 A] Mean Abs: 1.584 | Max: 7.361
[LOSS Ex2] A: 0.22555 | B: 0.41618 | C: 0.32264
** [JOINT LOSS] ** : 0.991527
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004600 | Grad Max: 0.126267
  -> Layer: shared_layers.0.bias | Grad Mean: 0.317400 | Grad Max: 1.593096
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.008422
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010304 | Grad Max: 0.010304
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.185435
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039606 | Grad Max: 1.046603
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000445 | Grad Max: 0.011358
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018871 | Grad Max: 0.072868
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000769
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004098 | Grad Max: 0.009437
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000341
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001252 | Grad Max: 0.003075
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002068 | Grad Max: 0.004267
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034854 | Grad Max: 0.034854
[GRADIENT NORM TOTAL] 6.1534

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.392
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047814  0.49521858] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 497/1551 | B: 367/1681 | C: 268/1780
[LOSS Ex1] A: 0.66968 | B: 0.66887 | C: 0.66680
[LOGITS Ex2 A] Mean Abs: 1.574 | Max: 5.474
[LOSS Ex2] A: 0.21383 | B: 0.40143 | C: 0.35391
** [JOINT LOSS] ** : 0.991508
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003101 | Grad Max: 0.109556
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087221 | Grad Max: 0.376075
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.009235
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012364 | Grad Max: 0.012364
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000733 | Grad Max: 0.124963
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013083 | Grad Max: 0.706277
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.004285
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005152 | Grad Max: 0.025895
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000328
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001148 | Grad Max: 0.003583
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000339 | Grad Max: 0.000929
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000436 | Grad Max: 0.001580
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008592 | Grad Max: 0.008592
[GRADIENT NORM TOTAL] 2.0558

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.403
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023418  0.49765816] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 488/1560 | B: 333/1523 | C: 280/1768
[LOSS Ex1] A: 0.66876 | B: 0.67166 | C: 0.66543
[LOGITS Ex2 A] Mean Abs: 1.520 | Max: 6.475
[LOSS Ex2] A: 0.23086 | B: 0.41564 | C: 0.36179
** [JOINT LOSS] ** : 1.004714
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006735 | Grad Max: 0.169788
  -> Layer: shared_layers.0.bias | Grad Mean: 0.488209 | Grad Max: 2.153198
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.008744
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006301 | Grad Max: 0.006301
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003194 | Grad Max: 0.312685
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060452 | Grad Max: 1.728952
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000657 | Grad Max: 0.017315
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028073 | Grad Max: 0.113120
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001177
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006065 | Grad Max: 0.012933
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000469
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001847 | Grad Max: 0.004406
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003104 | Grad Max: 0.006392
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052114 | Grad Max: 0.052114
[GRADIENT NORM TOTAL] 9.5048

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.341
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5053482 0.4946518] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 457/1591 | B: 344/1704 | C: 277/1771
[LOSS Ex1] A: 0.67291 | B: 0.67099 | C: 0.66595
[LOGITS Ex2 A] Mean Abs: 1.487 | Max: 5.724
[LOSS Ex2] A: 0.22237 | B: 0.45174 | C: 0.32769
** [JOINT LOSS] ** : 1.003883
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009995 | Grad Max: 0.210811
  -> Layer: shared_layers.0.bias | Grad Mean: 0.615201 | Grad Max: 2.798245
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.006816
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003427 | Grad Max: 0.003427
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004111 | Grad Max: 0.403556
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.077703 | Grad Max: 2.242881
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000855 | Grad Max: 0.021174
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036248 | Grad Max: 0.136252
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000121 | Grad Max: 0.001467
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007896 | Grad Max: 0.017028
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000595
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002396 | Grad Max: 0.005790
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003881 | Grad Max: 0.007585
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.065588 | Grad Max: 0.065588
[GRADIENT NORM TOTAL] 11.8690

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.234
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52144825 0.47855178] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 471/1577 | B: 354/1694 | C: 168/1208
[LOSS Ex1] A: 0.67243 | B: 0.67152 | C: 0.66800
[LOGITS Ex2 A] Mean Abs: 1.468 | Max: 5.226
[LOSS Ex2] A: 0.22732 | B: 0.42627 | C: 0.33586
** [JOINT LOSS] ** : 1.000464
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006784 | Grad Max: 0.151198
  -> Layer: shared_layers.0.bias | Grad Mean: 0.440531 | Grad Max: 1.939035
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.007706
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000336 | Grad Max: 0.000336
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002864 | Grad Max: 0.336818
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054035 | Grad Max: 1.823129
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000604 | Grad Max: 0.016252
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025634 | Grad Max: 0.103299
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001017
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005576 | Grad Max: 0.011864
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000450
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001693 | Grad Max: 0.004029
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002798 | Grad Max: 0.004956
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046616 | Grad Max: 0.046616
[GRADIENT NORM TOTAL] 8.4295

[EPOCH SUMMARY] Train Loss: 0.9945

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9640 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9662 -> New: 0.9640)

############################## EPOCH 56/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.382
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58903766 0.41096234] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.028
[MASKS] A(Pass/Fail): 525/1523 | B: 368/1680 | C: 275/1773
[LOSS Ex1] A: 0.66996 | B: 0.66872 | C: 0.66713
[LOGITS Ex2 A] Mean Abs: 1.563 | Max: 5.646
[LOSS Ex2] A: 0.20662 | B: 0.39923 | C: 0.33695
** [JOINT LOSS] ** : 0.982867
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003445 | Grad Max: 0.100401
  -> Layer: shared_layers.0.bias | Grad Mean: 0.146184 | Grad Max: 0.632074
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.008186
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006683 | Grad Max: 0.006683
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001066 | Grad Max: 0.117909
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019510 | Grad Max: 0.669744
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.005422
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008413 | Grad Max: 0.029823
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000460
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001866 | Grad Max: 0.004697
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000184
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.001619
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000906 | Grad Max: 0.002648
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016099 | Grad Max: 0.016099
[GRADIENT NORM TOTAL] 3.0094

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.426
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008962 0.4991038] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 492/1556 | B: 335/1521 | C: 264/1784
[LOSS Ex1] A: 0.67325 | B: 0.67152 | C: 0.66794
[LOGITS Ex2 A] Mean Abs: 1.574 | Max: 5.434
[LOSS Ex2] A: 0.20754 | B: 0.39914 | C: 0.32438
** [JOINT LOSS] ** : 0.981254
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003681 | Grad Max: 0.110131
  -> Layer: shared_layers.0.bias | Grad Mean: 0.281395 | Grad Max: 1.288144
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.007934
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009977 | Grad Max: 0.009977
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001907 | Grad Max: 0.167311
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035942 | Grad Max: 0.935129
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.009780
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017253 | Grad Max: 0.062904
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000674
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003747 | Grad Max: 0.008052
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000318
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001147 | Grad Max: 0.003041
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001786 | Grad Max: 0.004269
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031968 | Grad Max: 0.031968
[GRADIENT NORM TOTAL] 5.5779

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.220
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58037364 0.41962636] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 492/1556 | B: 350/1698 | C: 257/1791
[LOSS Ex1] A: 0.67139 | B: 0.67085 | C: 0.66769
[LOGITS Ex2 A] Mean Abs: 1.574 | Max: 5.553
[LOSS Ex2] A: 0.22663 | B: 0.42065 | C: 0.33568
** [JOINT LOSS] ** : 0.997629
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001737 | Grad Max: 0.038690
  -> Layer: shared_layers.0.bias | Grad Mean: 0.089255 | Grad Max: 0.369024
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.007906
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004798 | Grad Max: 0.004798
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000654 | Grad Max: 0.063520
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011413 | Grad Max: 0.348614
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000109 | Grad Max: 0.005321
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004431 | Grad Max: 0.029333
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000282
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000892 | Grad Max: 0.002975
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000122
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000261 | Grad Max: 0.000867
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000364 | Grad Max: 0.001329
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006770 | Grad Max: 0.006770
[GRADIENT NORM TOTAL] 1.8294

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.549896 0.450104] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 402/1214 | B: 354/1694 | C: 262/1786
[LOSS Ex1] A: 0.67034 | B: 0.67138 | C: 0.66471
[LOGITS Ex2 A] Mean Abs: 1.567 | Max: 6.065
[LOSS Ex2] A: 0.21162 | B: 0.41992 | C: 0.33547
** [JOINT LOSS] ** : 0.991144
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006450 | Grad Max: 0.139263
  -> Layer: shared_layers.0.bias | Grad Mean: 0.371871 | Grad Max: 1.794217
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.007755
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001635 | Grad Max: 0.001635
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002521 | Grad Max: 0.187365
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047381 | Grad Max: 1.044283
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.013181
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022514 | Grad Max: 0.081205
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000948
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004929 | Grad Max: 0.010758
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000385
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001507 | Grad Max: 0.003463
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002596 | Grad Max: 0.004558
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042820 | Grad Max: 0.042820
[GRADIENT NORM TOTAL] 7.0890

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.427
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063848 0.4936152] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 523/1525 | B: 372/1676 | C: 269/1779
[LOSS Ex1] A: 0.67054 | B: 0.66857 | C: 0.66541
[LOGITS Ex2 A] Mean Abs: 1.549 | Max: 7.478
[LOSS Ex2] A: 0.22302 | B: 0.42232 | C: 0.32901
** [JOINT LOSS] ** : 0.992957
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007001 | Grad Max: 0.172973
  -> Layer: shared_layers.0.bias | Grad Mean: 0.373512 | Grad Max: 1.652868
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.008132
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005964 | Grad Max: 0.005964
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002506 | Grad Max: 0.211143
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046842 | Grad Max: 1.159357
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000522 | Grad Max: 0.013269
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021995 | Grad Max: 0.079514
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000940
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004837 | Grad Max: 0.010839
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000373
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001467 | Grad Max: 0.003513
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002313 | Grad Max: 0.004150
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039560 | Grad Max: 0.039560
[GRADIENT NORM TOTAL] 7.0920

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.396
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5049813 0.4950187] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 520/1528 | B: 336/1520 | C: 277/1771
[LOSS Ex1] A: 0.66941 | B: 0.67138 | C: 0.66429
[LOGITS Ex2 A] Mean Abs: 1.570 | Max: 5.267
[LOSS Ex2] A: 0.21141 | B: 0.39192 | C: 0.32526
** [JOINT LOSS] ** : 0.977893
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.049330
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126740 | Grad Max: 0.574279
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.008835
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007731 | Grad Max: 0.007731
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000846 | Grad Max: 0.144941
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015059 | Grad Max: 0.812634
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.005020
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006364 | Grad Max: 0.030090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000301
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001362 | Grad Max: 0.003570
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000153
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000406 | Grad Max: 0.001330
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000624 | Grad Max: 0.001832
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010111 | Grad Max: 0.010111
[GRADIENT NORM TOTAL] 2.5904

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.407
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50220096 0.497799  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 503/1545 | B: 351/1697 | C: 286/1762
[LOSS Ex1] A: 0.66850 | B: 0.67071 | C: 0.66554
[LOGITS Ex2 A] Mean Abs: 1.612 | Max: 6.825
[LOSS Ex2] A: 0.24349 | B: 0.42401 | C: 0.34641
** [JOINT LOSS] ** : 1.006221
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009104 | Grad Max: 0.256061
  -> Layer: shared_layers.0.bias | Grad Mean: 0.486404 | Grad Max: 2.165890
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.008990
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009262 | Grad Max: 0.009262
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003384 | Grad Max: 0.251605
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063290 | Grad Max: 1.376480
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000683 | Grad Max: 0.016234
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028959 | Grad Max: 0.105601
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001147
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006389 | Grad Max: 0.013754
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000525
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001941 | Grad Max: 0.004881
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003074 | Grad Max: 0.005371
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052688 | Grad Max: 0.052688
[GRADIENT NORM TOTAL] 9.2630

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052187 0.4947813] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.028
[MASKS] A(Pass/Fail): 473/1575 | B: 354/1694 | C: 272/1776
[LOSS Ex1] A: 0.67270 | B: 0.67124 | C: 0.66659
[LOGITS Ex2 A] Mean Abs: 1.605 | Max: 5.521
[LOSS Ex2] A: 0.23059 | B: 0.42807 | C: 0.35144
** [JOINT LOSS] ** : 1.006878
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009885 | Grad Max: 0.218709
  -> Layer: shared_layers.0.bias | Grad Mean: 0.594088 | Grad Max: 2.611068
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.006954
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001348 | Grad Max: 0.001348
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003969 | Grad Max: 0.353712
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074847 | Grad Max: 1.898408
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000818 | Grad Max: 0.019453
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034777 | Grad Max: 0.128960
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000115 | Grad Max: 0.001389
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007610 | Grad Max: 0.016977
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000572
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002312 | Grad Max: 0.005368
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003700 | Grad Max: 0.006536
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063213 | Grad Max: 0.063213
[GRADIENT NORM TOTAL] 11.3504

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.236
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5217529 0.4782471] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.027
[MASKS] A(Pass/Fail): 491/1557 | B: 373/1675 | C: 238/1810
[LOSS Ex1] A: 0.67222 | B: 0.66843 | C: 0.66812
[LOGITS Ex2 A] Mean Abs: 1.546 | Max: 6.228
[LOSS Ex2] A: 0.22304 | B: 0.40475 | C: 0.35399
** [JOINT LOSS] ** : 0.996847
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006111 | Grad Max: 0.139189
  -> Layer: shared_layers.0.bias | Grad Mean: 0.358587 | Grad Max: 1.560079
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.006961
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004946 | Grad Max: 0.004946
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002430 | Grad Max: 0.232498
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045559 | Grad Max: 1.248395
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000497 | Grad Max: 0.012415
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021149 | Grad Max: 0.080397
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000831
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004644 | Grad Max: 0.009689
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000343
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001417 | Grad Max: 0.003333
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002288 | Grad Max: 0.004202
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039222 | Grad Max: 0.039222
[GRADIENT NORM TOTAL] 6.9759

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.385
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59032685 0.40967312] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 549/1499 | B: 337/1519 | C: 251/1797
[LOSS Ex1] A: 0.66970 | B: 0.67126 | C: 0.66676
[LOGITS Ex2 A] Mean Abs: 1.544 | Max: 5.841
[LOSS Ex2] A: 0.21653 | B: 0.39068 | C: 0.32569
** [JOINT LOSS] ** : 0.980205
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.044658
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132940 | Grad Max: 0.643621
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.008767
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012075 | Grad Max: 0.012075
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000927 | Grad Max: 0.117750
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016787 | Grad Max: 0.667532
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.005898
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007091 | Grad Max: 0.032998
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000377
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001491 | Grad Max: 0.004128
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000174
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000437 | Grad Max: 0.001466
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000600 | Grad Max: 0.001915
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010636 | Grad Max: 0.010636
[GRADIENT NORM TOTAL] 2.7516

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.430
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007908 0.4992092] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 511/1537 | B: 351/1697 | C: 277/1771
[LOSS Ex1] A: 0.67304 | B: 0.67058 | C: 0.66631
[LOGITS Ex2 A] Mean Abs: 1.542 | Max: 5.847
[LOSS Ex2] A: 0.20437 | B: 0.42719 | C: 0.32712
** [JOINT LOSS] ** : 0.989538
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004948 | Grad Max: 0.126977
  -> Layer: shared_layers.0.bias | Grad Mean: 0.319665 | Grad Max: 1.433124
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001989 | Grad Max: 0.007893
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008149 | Grad Max: 0.008149
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.290361
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040743 | Grad Max: 1.643592
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000433 | Grad Max: 0.011482
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018331 | Grad Max: 0.071126
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000706
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004030 | Grad Max: 0.008923
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000326
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001222 | Grad Max: 0.003073
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001925 | Grad Max: 0.003492
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032879 | Grad Max: 0.032879
[GRADIENT NORM TOTAL] 6.5796

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.223
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5813542 0.4186458] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 501/1547 | B: 354/1694 | C: 251/1797
[LOSS Ex1] A: 0.67116 | B: 0.67112 | C: 0.66796
[LOGITS Ex2 A] Mean Abs: 1.546 | Max: 5.915
[LOSS Ex2] A: 0.22988 | B: 0.41475 | C: 0.31904
** [JOINT LOSS] ** : 0.991303
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003777 | Grad Max: 0.079065
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160319 | Grad Max: 0.685577
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001970 | Grad Max: 0.008086
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006813 | Grad Max: 0.006813
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001120 | Grad Max: 0.226405
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020989 | Grad Max: 1.282527
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000209 | Grad Max: 0.005355
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008918 | Grad Max: 0.034701
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000447
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001974 | Grad Max: 0.004767
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000181
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000593 | Grad Max: 0.001634
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000905 | Grad Max: 0.002235
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015620 | Grad Max: 0.015620
[GRADIENT NORM TOTAL] 3.6128

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.294
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5505402 0.4494598] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.028
[MASKS] A(Pass/Fail): 415/1201 | B: 372/1676 | C: 277/1771
[LOSS Ex1] A: 0.67012 | B: 0.66831 | C: 0.66574
[LOGITS Ex2 A] Mean Abs: 1.607 | Max: 5.699
[LOSS Ex2] A: 0.20906 | B: 0.40472 | C: 0.32351
** [JOINT LOSS] ** : 0.980484
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002753 | Grad Max: 0.087881
  -> Layer: shared_layers.0.bias | Grad Mean: 0.231160 | Grad Max: 1.122072
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.008161
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001171 | Grad Max: 0.001171
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001485 | Grad Max: 0.144915
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027642 | Grad Max: 0.819403
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.008847
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012414 | Grad Max: 0.053906
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000527
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002643 | Grad Max: 0.005904
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000210
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000806 | Grad Max: 0.002059
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001223 | Grad Max: 0.002838
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022108 | Grad Max: 0.022108
[GRADIENT NORM TOTAL] 4.5801

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.431
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062857  0.49371427] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 533/1515 | B: 336/1520 | C: 172/1204
[LOSS Ex1] A: 0.67032 | B: 0.67114 | C: 0.66844
[LOGITS Ex2 A] Mean Abs: 1.592 | Max: 6.032
[LOSS Ex2] A: 0.21979 | B: 0.38447 | C: 0.33849
** [JOINT LOSS] ** : 0.984215
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004106 | Grad Max: 0.099110
  -> Layer: shared_layers.0.bias | Grad Mean: 0.279398 | Grad Max: 1.215695
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001931 | Grad Max: 0.007943
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006125 | Grad Max: 0.006125
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001851 | Grad Max: 0.173503
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034697 | Grad Max: 0.914301
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000384 | Grad Max: 0.011787
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016422 | Grad Max: 0.070407
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000730
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003541 | Grad Max: 0.008033
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000272
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001080 | Grad Max: 0.002610
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001684 | Grad Max: 0.003603
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029826 | Grad Max: 0.029826
[GRADIENT NORM TOTAL] 5.4584

[EPOCH SUMMARY] Train Loss: 0.9900

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9620 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9640 -> New: 0.9620)

############################## EPOCH 57/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.398
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50507283 0.4949272 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 528/1520 | B: 353/1695 | C: 244/1804
[LOSS Ex1] A: 0.66918 | B: 0.67046 | C: 0.66795
[LOGITS Ex2 A] Mean Abs: 1.558 | Max: 5.676
[LOSS Ex2] A: 0.20707 | B: 0.42018 | C: 0.32352
** [JOINT LOSS] ** : 0.986121
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001510 | Grad Max: 0.055128
  -> Layer: shared_layers.0.bias | Grad Mean: 0.023101 | Grad Max: 0.098332
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.008505
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008072 | Grad Max: 0.008072
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000299 | Grad Max: 0.032802
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.004708 | Grad Max: 0.175928
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.003225
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001160 | Grad Max: 0.013012
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000148
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000208 | Grad Max: 0.001353
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000073
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000063 | Grad Max: 0.000407
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000396 | Grad Max: 0.001158
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000553 | Grad Max: 0.000553
[GRADIENT NORM TOTAL] 0.6772

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.410
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020897 0.4979103] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 513/1535 | B: 355/1693 | C: 261/1787
[LOSS Ex1] A: 0.66826 | B: 0.67100 | C: 0.66729
[LOGITS Ex2 A] Mean Abs: 1.523 | Max: 8.216
[LOSS Ex2] A: 0.23214 | B: 0.41263 | C: 0.34119
** [JOINT LOSS] ** : 0.997506
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003054 | Grad Max: 0.080178
  -> Layer: shared_layers.0.bias | Grad Mean: 0.213725 | Grad Max: 0.999366
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.008758
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009759 | Grad Max: 0.009759
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001400 | Grad Max: 0.148110
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026001 | Grad Max: 0.812616
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.009006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012091 | Grad Max: 0.053587
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000534
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002615 | Grad Max: 0.006006
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000213
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000786 | Grad Max: 0.001929
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001265 | Grad Max: 0.002409
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021720 | Grad Max: 0.021720
[GRADIENT NORM TOTAL] 4.1027

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.346
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50510126 0.49489874] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.028
[MASKS] A(Pass/Fail): 483/1565 | B: 373/1675 | C: 282/1766
[LOSS Ex1] A: 0.67250 | B: 0.66817 | C: 0.66511
[LOGITS Ex2 A] Mean Abs: 1.517 | Max: 6.282
[LOSS Ex2] A: 0.21144 | B: 0.39717 | C: 0.32280
** [JOINT LOSS] ** : 0.979061
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.044460
  -> Layer: shared_layers.0.bias | Grad Mean: 0.118342 | Grad Max: 0.537693
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001998 | Grad Max: 0.006754
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002041 | Grad Max: 0.002041
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000854 | Grad Max: 0.143090
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015346 | Grad Max: 0.783975
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.005519
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007247 | Grad Max: 0.031704
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000441
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001564 | Grad Max: 0.004329
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000161
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000464 | Grad Max: 0.001336
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000761 | Grad Max: 0.002519
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012161 | Grad Max: 0.012161
[GRADIENT NORM TOTAL] 2.4522

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.237
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52197105 0.47802892] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.028
[MASKS] A(Pass/Fail): 494/1554 | B: 336/1520 | C: 288/1760
[LOSS Ex1] A: 0.67202 | B: 0.67099 | C: 0.66349
[LOGITS Ex2 A] Mean Abs: 1.537 | Max: 6.187
[LOSS Ex2] A: 0.23210 | B: 0.39229 | C: 0.31940
** [JOINT LOSS] ** : 0.983434
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006207 | Grad Max: 0.176618
  -> Layer: shared_layers.0.bias | Grad Mean: 0.301464 | Grad Max: 1.278928
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.008240
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005223 | Grad Max: 0.005223
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.193152
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039693 | Grad Max: 1.060018
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000425 | Grad Max: 0.010813
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018050 | Grad Max: 0.068042
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000741
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004021 | Grad Max: 0.008733
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000296
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001241 | Grad Max: 0.002954
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002045 | Grad Max: 0.004403
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035462 | Grad Max: 0.035462
[GRADIENT NORM TOTAL] 5.8858

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.388
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5915564 0.4084436] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 555/1493 | B: 353/1695 | C: 292/1756
[LOSS Ex1] A: 0.66946 | B: 0.67031 | C: 0.66454
[LOGITS Ex2 A] Mean Abs: 1.581 | Max: 6.431
[LOSS Ex2] A: 0.21854 | B: 0.41860 | C: 0.31809
** [JOINT LOSS] ** : 0.986514
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007273 | Grad Max: 0.225663
  -> Layer: shared_layers.0.bias | Grad Mean: 0.370615 | Grad Max: 1.641627
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.008308
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006946 | Grad Max: 0.006946
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002582 | Grad Max: 0.192960
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047801 | Grad Max: 1.043747
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000516 | Grad Max: 0.013547
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021861 | Grad Max: 0.081119
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000927
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004866 | Grad Max: 0.010851
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000361
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001493 | Grad Max: 0.003707
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002417 | Grad Max: 0.004836
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041620 | Grad Max: 0.041620
[GRADIENT NORM TOTAL] 7.0986

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.433
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50067407 0.4993259 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 515/1533 | B: 356/1692 | C: 270/1778
[LOSS Ex1] A: 0.67283 | B: 0.67085 | C: 0.66478
[LOGITS Ex2 A] Mean Abs: 1.563 | Max: 5.534
[LOSS Ex2] A: 0.20259 | B: 0.40868 | C: 0.33209
** [JOINT LOSS] ** : 0.983936
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001596 | Grad Max: 0.041583
  -> Layer: shared_layers.0.bias | Grad Mean: 0.079974 | Grad Max: 0.424801
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.007319
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003190 | Grad Max: 0.003190
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000648 | Grad Max: 0.097449
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011618 | Grad Max: 0.530799
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.004011
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004646 | Grad Max: 0.022299
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000311
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000982 | Grad Max: 0.003090
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000126
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000299 | Grad Max: 0.001113
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000506 | Grad Max: 0.002069
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007941 | Grad Max: 0.007941
[GRADIENT NORM TOTAL] 1.8511

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.225
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5823173  0.41768274] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 512/1536 | B: 374/1674 | C: 264/1784
[LOSS Ex1] A: 0.67093 | B: 0.66801 | C: 0.66585
[LOGITS Ex2 A] Mean Abs: 1.537 | Max: 5.647
[LOSS Ex2] A: 0.22342 | B: 0.41257 | C: 0.34938
** [JOINT LOSS] ** : 0.996716
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006182 | Grad Max: 0.131971
  -> Layer: shared_layers.0.bias | Grad Mean: 0.405528 | Grad Max: 1.802816
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.007100
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004461 | Grad Max: 0.004461
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002674 | Grad Max: 0.241907
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050536 | Grad Max: 1.367549
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000562 | Grad Max: 0.013880
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024016 | Grad Max: 0.089027
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000905
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005243 | Grad Max: 0.010781
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000395
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001602 | Grad Max: 0.003724
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002604 | Grad Max: 0.005096
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044114 | Grad Max: 0.044114
[GRADIENT NORM TOTAL] 7.7434

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.298
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5511262  0.44887382] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 423/1193 | B: 343/1513 | C: 265/1783
[LOSS Ex1] A: 0.66987 | B: 0.67085 | C: 0.66622
[LOGITS Ex2 A] Mean Abs: 1.541 | Max: 5.834
[LOSS Ex2] A: 0.22006 | B: 0.41950 | C: 0.34423
** [JOINT LOSS] ** : 0.996913
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008900 | Grad Max: 0.182869
  -> Layer: shared_layers.0.bias | Grad Mean: 0.550196 | Grad Max: 2.393087
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.008227
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005327 | Grad Max: 0.005327
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003644 | Grad Max: 0.316444
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.068367 | Grad Max: 1.767975
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000746 | Grad Max: 0.019130
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031796 | Grad Max: 0.120224
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001236
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006943 | Grad Max: 0.015014
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000536
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002112 | Grad Max: 0.005057
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003424 | Grad Max: 0.006191
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057931 | Grad Max: 0.057931
[GRADIENT NORM TOTAL] 10.6161

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.434
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062187  0.49378133] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 541/1507 | B: 355/1693 | C: 251/1797
[LOSS Ex1] A: 0.67007 | B: 0.67016 | C: 0.66886
[LOGITS Ex2 A] Mean Abs: 1.554 | Max: 7.000
[LOSS Ex2] A: 0.21105 | B: 0.42887 | C: 0.34138
** [JOINT LOSS] ** : 0.996796
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006332 | Grad Max: 0.142905
  -> Layer: shared_layers.0.bias | Grad Mean: 0.419623 | Grad Max: 1.840796
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001994 | Grad Max: 0.008158
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007879 | Grad Max: 0.007879
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002798 | Grad Max: 0.310549
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052669 | Grad Max: 1.755409
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000571 | Grad Max: 0.015516
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024488 | Grad Max: 0.092197
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000910
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005357 | Grad Max: 0.011234
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000409
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001625 | Grad Max: 0.003953
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002594 | Grad Max: 0.004514
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044090 | Grad Max: 0.044090
[GRADIENT NORM TOTAL] 8.2847

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.402
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50512266 0.4948774 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 533/1515 | B: 358/1690 | C: 258/1790
[LOSS Ex1] A: 0.66892 | B: 0.67070 | C: 0.66720
[LOGITS Ex2 A] Mean Abs: 1.588 | Max: 5.663
[LOSS Ex2] A: 0.21516 | B: 0.40303 | C: 0.35304
** [JOINT LOSS] ** : 0.992684
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002509 | Grad Max: 0.062150
  -> Layer: shared_layers.0.bias | Grad Mean: 0.130988 | Grad Max: 0.548308
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.008672
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011959 | Grad Max: 0.011959
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000943 | Grad Max: 0.088968
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017143 | Grad Max: 0.486428
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.006005
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007408 | Grad Max: 0.036569
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000442
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001635 | Grad Max: 0.004901
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000167
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000492 | Grad Max: 0.001266
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000656 | Grad Max: 0.001898
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012672 | Grad Max: 0.012672
[GRADIENT NORM TOTAL] 2.6556

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.413
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50201386 0.49798617] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 522/1526 | B: 376/1672 | C: 245/1803
[LOSS Ex1] A: 0.66801 | B: 0.66787 | C: 0.66700
[LOGITS Ex2 A] Mean Abs: 1.586 | Max: 5.941
[LOSS Ex2] A: 0.23297 | B: 0.39941 | C: 0.33511
** [JOINT LOSS] ** : 0.990121
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005935 | Grad Max: 0.188742
  -> Layer: shared_layers.0.bias | Grad Mean: 0.287944 | Grad Max: 1.285261
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.008939
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007975 | Grad Max: 0.007975
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.170658
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037646 | Grad Max: 0.892579
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.010202
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017302 | Grad Max: 0.064845
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000782
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003845 | Grad Max: 0.008599
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000322
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001190 | Grad Max: 0.003010
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002008 | Grad Max: 0.004341
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033831 | Grad Max: 0.033831
[GRADIENT NORM TOTAL] 5.6126

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.349
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050191 0.4949809] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.524 | Std: 0.028
[MASKS] A(Pass/Fail): 494/1554 | B: 343/1513 | C: 285/1763
[LOSS Ex1] A: 0.67229 | B: 0.67071 | C: 0.66487
[LOGITS Ex2 A] Mean Abs: 1.552 | Max: 5.281
[LOSS Ex2] A: 0.20441 | B: 0.38768 | C: 0.31184
** [JOINT LOSS] ** : 0.970599
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.033679
  -> Layer: shared_layers.0.bias | Grad Mean: 0.047245 | Grad Max: 0.226833
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.006587
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004682 | Grad Max: 0.004682
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000377 | Grad Max: 0.138343
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006309 | Grad Max: 0.773839
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.002979
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001545 | Grad Max: 0.016513
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000159
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000285 | Grad Max: 0.001573
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000604
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000616 | Grad Max: 0.001587
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002449 | Grad Max: 0.002449
[GRADIENT NORM TOTAL] 1.4744

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.239
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5221809 0.4778191] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.524 | Std: 0.028
[MASKS] A(Pass/Fail): 500/1548 | B: 357/1691 | C: 247/1801
[LOSS Ex1] A: 0.67181 | B: 0.67002 | C: 0.66748
[LOGITS Ex2 A] Mean Abs: 1.492 | Max: 6.098
[LOSS Ex2] A: 0.21761 | B: 0.41682 | C: 0.34140
** [JOINT LOSS] ** : 0.995042
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003260 | Grad Max: 0.082620
  -> Layer: shared_layers.0.bias | Grad Mean: 0.240549 | Grad Max: 1.091910
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.007447
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000359 | Grad Max: 0.000359
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001562 | Grad Max: 0.161894
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029632 | Grad Max: 0.908848
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.009806
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014358 | Grad Max: 0.060036
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000565
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003089 | Grad Max: 0.006661
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000932 | Grad Max: 0.002433
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001560 | Grad Max: 0.003268
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025519 | Grad Max: 0.025519
[GRADIENT NORM TOTAL] 4.6262

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.392
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59275395 0.40724608] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 561/1487 | B: 360/1688 | C: 180/1196
[LOSS Ex1] A: 0.66922 | B: 0.67055 | C: 0.66484
[LOGITS Ex2 A] Mean Abs: 1.567 | Max: 5.673
[LOSS Ex2] A: 0.21282 | B: 0.41856 | C: 0.31154
** [JOINT LOSS] ** : 0.982515
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002538 | Grad Max: 0.053101
  -> Layer: shared_layers.0.bias | Grad Mean: 0.099945 | Grad Max: 0.430139
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.009164
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014740 | Grad Max: 0.014740
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000674 | Grad Max: 0.096188
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012289 | Grad Max: 0.491487
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.004602
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005884 | Grad Max: 0.029676
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000363
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001273 | Grad Max: 0.003375
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000143
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000383 | Grad Max: 0.001088
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000735 | Grad Max: 0.002044
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011220 | Grad Max: 0.011220
[GRADIENT NORM TOTAL] 1.9474

[EPOCH SUMMARY] Train Loss: 0.9884

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9651 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 58/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.437
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50061876 0.49938124] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 522/1526 | B: 376/1672 | C: 268/1780
[LOSS Ex1] A: 0.67261 | B: 0.66770 | C: 0.66579
[LOGITS Ex2 A] Mean Abs: 1.592 | Max: 5.415
[LOSS Ex2] A: 0.21236 | B: 0.40148 | C: 0.33980
** [JOINT LOSS] ** : 0.986582
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003416 | Grad Max: 0.119319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.324266 | Grad Max: 1.454116
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001992 | Grad Max: 0.007294
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004307 | Grad Max: 0.004307
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.188603
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039823 | Grad Max: 1.061224
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.013082
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019190 | Grad Max: 0.077629
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000868
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004147 | Grad Max: 0.009618
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001241 | Grad Max: 0.003035
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001838 | Grad Max: 0.003663
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033190 | Grad Max: 0.033190
[GRADIENT NORM TOTAL] 6.3859

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.228
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5832424  0.41675755] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 514/1534 | B: 345/1511 | C: 269/1779
[LOSS Ex1] A: 0.67069 | B: 0.67055 | C: 0.66525
[LOGITS Ex2 A] Mean Abs: 1.610 | Max: 6.016
[LOSS Ex2] A: 0.22422 | B: 0.39694 | C: 0.31809
** [JOINT LOSS] ** : 0.981913
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004750 | Grad Max: 0.113877
  -> Layer: shared_layers.0.bias | Grad Mean: 0.310633 | Grad Max: 1.329426
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.007948
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002724 | Grad Max: 0.002724
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.205885
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039947 | Grad Max: 1.155029
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000441 | Grad Max: 0.012274
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019038 | Grad Max: 0.076296
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000755
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004138 | Grad Max: 0.009094
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000287
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001250 | Grad Max: 0.002826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001978 | Grad Max: 0.004070
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034092 | Grad Max: 0.034092
[GRADIENT NORM TOTAL] 6.1646

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.302
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55172366 0.44827637] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 427/1189 | B: 359/1689 | C: 292/1756
[LOSS Ex1] A: 0.66964 | B: 0.66985 | C: 0.66395
[LOGITS Ex2 A] Mean Abs: 1.614 | Max: 6.439
[LOSS Ex2] A: 0.20043 | B: 0.41482 | C: 0.31626
** [JOINT LOSS] ** : 0.978315
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001445 | Grad Max: 0.037761
  -> Layer: shared_layers.0.bias | Grad Mean: 0.058765 | Grad Max: 0.380012
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.007899
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002546 | Grad Max: 0.002546
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000413 | Grad Max: 0.096327
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007008 | Grad Max: 0.549876
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.002705
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002245 | Grad Max: 0.015795
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000487 | Grad Max: 0.002366
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000686
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000429 | Grad Max: 0.001582
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003903 | Grad Max: 0.003903
[GRADIENT NORM TOTAL] 1.2811

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.439
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50619185 0.49380815] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 551/1497 | B: 361/1687 | C: 257/1791
[LOSS Ex1] A: 0.66982 | B: 0.67038 | C: 0.66727
[LOGITS Ex2 A] Mean Abs: 1.590 | Max: 6.128
[LOSS Ex2] A: 0.20453 | B: 0.41788 | C: 0.33501
** [JOINT LOSS] ** : 0.988293
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005946 | Grad Max: 0.141598
  -> Layer: shared_layers.0.bias | Grad Mean: 0.362966 | Grad Max: 1.643561
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.007760
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004579 | Grad Max: 0.004579
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002482 | Grad Max: 0.206940
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046920 | Grad Max: 1.170284
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000520 | Grad Max: 0.013182
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022292 | Grad Max: 0.086156
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000873
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004889 | Grad Max: 0.010675
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000366
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001483 | Grad Max: 0.003508
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002465 | Grad Max: 0.004219
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041204 | Grad Max: 0.041204
[GRADIENT NORM TOTAL] 7.0253

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.406
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50516295 0.49483705] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 540/1508 | B: 379/1669 | C: 264/1784
[LOSS Ex1] A: 0.66864 | B: 0.66753 | C: 0.66703
[LOGITS Ex2 A] Mean Abs: 1.571 | Max: 5.462
[LOSS Ex2] A: 0.19847 | B: 0.40968 | C: 0.33504
** [JOINT LOSS] ** : 0.982131
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005004 | Grad Max: 0.122232
  -> Layer: shared_layers.0.bias | Grad Mean: 0.351743 | Grad Max: 1.597424
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.009098
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014256 | Grad Max: 0.014256
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002329 | Grad Max: 0.204194
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043880 | Grad Max: 1.162877
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000482 | Grad Max: 0.013218
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020734 | Grad Max: 0.082758
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000886
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004534 | Grad Max: 0.009959
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000354
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001373 | Grad Max: 0.003410
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002224 | Grad Max: 0.004201
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037215 | Grad Max: 0.037215
[GRADIENT NORM TOTAL] 6.7174

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.417
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019931 0.4980069] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 528/1520 | B: 348/1508 | C: 278/1770
[LOSS Ex1] A: 0.66773 | B: 0.67039 | C: 0.66371
[LOGITS Ex2 A] Mean Abs: 1.575 | Max: 6.758
[LOSS Ex2] A: 0.22272 | B: 0.38611 | C: 0.32995
** [JOINT LOSS] ** : 0.980203
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003300 | Grad Max: 0.112196
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087179 | Grad Max: 0.373124
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.009346
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013761 | Grad Max: 0.013761
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000706 | Grad Max: 0.122353
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011370 | Grad Max: 0.689542
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000094 | Grad Max: 0.004435
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003444 | Grad Max: 0.024544
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000244
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000604 | Grad Max: 0.002771
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000163 | Grad Max: 0.000772
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000293 | Grad Max: 0.001122
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003828 | Grad Max: 0.003828
[GRADIENT NORM TOTAL] 1.8556

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.352
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050011 0.4949989] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 505/1543 | B: 359/1689 | C: 254/1794
[LOSS Ex1] A: 0.67204 | B: 0.66968 | C: 0.66678
[LOGITS Ex2 A] Mean Abs: 1.598 | Max: 5.595
[LOSS Ex2] A: 0.20967 | B: 0.42350 | C: 0.33856
** [JOINT LOSS] ** : 0.993411
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004415 | Grad Max: 0.108618
  -> Layer: shared_layers.0.bias | Grad Mean: 0.323027 | Grad Max: 1.420796
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001897 | Grad Max: 0.006998
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002453 | Grad Max: 0.002453
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.174121
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039217 | Grad Max: 0.982516
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.010742
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018382 | Grad Max: 0.075636
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000704
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003990 | Grad Max: 0.008808
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000327
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001202 | Grad Max: 0.002880
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001909 | Grad Max: 0.003746
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032802 | Grad Max: 0.032802
[GRADIENT NORM TOTAL] 6.2056

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.241
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5224113  0.47758865] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.028
[MASKS] A(Pass/Fail): 503/1545 | B: 363/1685 | C: 275/1773
[LOSS Ex1] A: 0.67156 | B: 0.67021 | C: 0.66455
[LOGITS Ex2 A] Mean Abs: 1.585 | Max: 6.081
[LOSS Ex2] A: 0.22705 | B: 0.42560 | C: 0.30644
** [JOINT LOSS] ** : 0.988471
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005743 | Grad Max: 0.143655
  -> Layer: shared_layers.0.bias | Grad Mean: 0.397647 | Grad Max: 1.976695
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.006825
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006430 | Grad Max: 0.006430
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002606 | Grad Max: 0.257885
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049436 | Grad Max: 1.412772
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.014849
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023211 | Grad Max: 0.094558
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000960
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005040 | Grad Max: 0.010857
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000369
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001515 | Grad Max: 0.003584
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002401 | Grad Max: 0.004539
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040949 | Grad Max: 0.040949
[GRADIENT NORM TOTAL] 7.8313

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.396
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59413296 0.4058671 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.030
[MASKS] A(Pass/Fail): 571/1477 | B: 381/1667 | C: 262/1786
[LOSS Ex1] A: 0.66894 | B: 0.66736 | C: 0.66605
[LOGITS Ex2 A] Mean Abs: 1.609 | Max: 6.271
[LOSS Ex2] A: 0.20450 | B: 0.38862 | C: 0.33065
** [JOINT LOSS] ** : 0.975372
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002617 | Grad Max: 0.108151
  -> Layer: shared_layers.0.bias | Grad Mean: 0.229687 | Grad Max: 1.209180
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.008398
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007608 | Grad Max: 0.007608
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001540 | Grad Max: 0.178276
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028897 | Grad Max: 1.004770
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.009319
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013497 | Grad Max: 0.058976
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000573
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002896 | Grad Max: 0.006341
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000226
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000877 | Grad Max: 0.002174
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001393 | Grad Max: 0.003369
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024407 | Grad Max: 0.024407
[GRADIENT NORM TOTAL] 4.7253

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.442
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50058275 0.49941725] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 529/1519 | B: 350/1506 | C: 261/1787
[LOSS Ex1] A: 0.67236 | B: 0.67022 | C: 0.66770
[LOGITS Ex2 A] Mean Abs: 1.583 | Max: 5.645
[LOSS Ex2] A: 0.21779 | B: 0.39808 | C: 0.34802
** [JOINT LOSS] ** : 0.991393
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007686 | Grad Max: 0.235881
  -> Layer: shared_layers.0.bias | Grad Mean: 0.333056 | Grad Max: 1.379103
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001870 | Grad Max: 0.007243
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004975 | Grad Max: 0.004975
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002355 | Grad Max: 0.197010
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043053 | Grad Max: 0.992093
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000466 | Grad Max: 0.010765
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019676 | Grad Max: 0.070452
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000742
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004400 | Grad Max: 0.009282
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000367
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001340 | Grad Max: 0.003304
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002252 | Grad Max: 0.004526
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037142 | Grad Max: 0.037142
[GRADIENT NORM TOTAL] 6.2214

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.231
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5843062  0.41569382] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.029
[MASKS] A(Pass/Fail): 521/1527 | B: 363/1685 | C: 268/1780
[LOSS Ex1] A: 0.67043 | B: 0.66952 | C: 0.66469
[LOGITS Ex2 A] Mean Abs: 1.567 | Max: 5.932
[LOSS Ex2] A: 0.22198 | B: 0.43160 | C: 0.33056
** [JOINT LOSS] ** : 0.996258
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008717 | Grad Max: 0.227425
  -> Layer: shared_layers.0.bias | Grad Mean: 0.440311 | Grad Max: 1.929415
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.007960
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005873 | Grad Max: 0.005873
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003094 | Grad Max: 0.250056
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057741 | Grad Max: 1.372991
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000633 | Grad Max: 0.015469
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027010 | Grad Max: 0.100097
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001030
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005945 | Grad Max: 0.012534
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000525
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001799 | Grad Max: 0.004515
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002943 | Grad Max: 0.005314
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048776 | Grad Max: 0.048776
[GRADIENT NORM TOTAL] 8.4282

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.306
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5523942  0.44760573] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 430/1186 | B: 363/1685 | C: 243/1805
[LOSS Ex1] A: 0.66937 | B: 0.67006 | C: 0.66677
[LOGITS Ex2 A] Mean Abs: 1.615 | Max: 6.129
[LOSS Ex2] A: 0.20906 | B: 0.41120 | C: 0.31557
** [JOINT LOSS] ** : 0.980680
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004661 | Grad Max: 0.132481
  -> Layer: shared_layers.0.bias | Grad Mean: 0.232357 | Grad Max: 1.119301
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.008278
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003309 | Grad Max: 0.003309
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001631 | Grad Max: 0.128200
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029854 | Grad Max: 0.691799
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.010064
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014467 | Grad Max: 0.059901
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000596
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003217 | Grad Max: 0.007114
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000274
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000976 | Grad Max: 0.002440
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001704 | Grad Max: 0.003343
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027219 | Grad Max: 0.027219
[GRADIENT NORM TOTAL] 4.4225

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.443
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50616175 0.49383825] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 561/1487 | B: 381/1667 | C: 292/1756
[LOSS Ex1] A: 0.66956 | B: 0.66720 | C: 0.66391
[LOGITS Ex2 A] Mean Abs: 1.643 | Max: 7.541
[LOSS Ex2] A: 0.22195 | B: 0.40151 | C: 0.32853
** [JOINT LOSS] ** : 0.984218
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004078 | Grad Max: 0.118318
  -> Layer: shared_layers.0.bias | Grad Mean: 0.350693 | Grad Max: 1.520706
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.008064
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004953 | Grad Max: 0.004953
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.214989
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043049 | Grad Max: 1.209262
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.013453
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020135 | Grad Max: 0.083762
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000768
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004306 | Grad Max: 0.009069
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000338
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001299 | Grad Max: 0.003262
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002131 | Grad Max: 0.004237
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035944 | Grad Max: 0.035944
[GRADIENT NORM TOTAL] 6.9845

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.409
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052999  0.49470013] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 548/1500 | B: 351/1505 | C: 160/1216
[LOSS Ex1] A: 0.66838 | B: 0.67008 | C: 0.66642
[LOGITS Ex2 A] Mean Abs: 1.645 | Max: 5.342
[LOSS Ex2] A: 0.22865 | B: 0.40543 | C: 0.32851
** [JOINT LOSS] ** : 0.989156
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008463 | Grad Max: 0.219643
  -> Layer: shared_layers.0.bias | Grad Mean: 0.480277 | Grad Max: 2.079160
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.008587
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009592 | Grad Max: 0.009592
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003212 | Grad Max: 0.321003
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060828 | Grad Max: 1.702963
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000656 | Grad Max: 0.016325
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028263 | Grad Max: 0.109033
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001144
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006157 | Grad Max: 0.013282
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000495
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001859 | Grad Max: 0.004598
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003005 | Grad Max: 0.005546
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050432 | Grad Max: 0.050432
[GRADIENT NORM TOTAL] 9.3451

[EPOCH SUMMARY] Train Loss: 0.9855

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9659 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 59/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.420
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50190616 0.4980938 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 535/1513 | B: 363/1685 | C: 269/1779
[LOSS Ex1] A: 0.66747 | B: 0.66938 | C: 0.66409
[LOGITS Ex2 A] Mean Abs: 1.639 | Max: 5.914
[LOSS Ex2] A: 0.23537 | B: 0.41542 | C: 0.30711
** [JOINT LOSS] ** : 0.986281
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008306 | Grad Max: 0.248772
  -> Layer: shared_layers.0.bias | Grad Mean: 0.365890 | Grad Max: 1.569164
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.008916
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004936 | Grad Max: 0.004936
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002577 | Grad Max: 0.237593
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048303 | Grad Max: 1.245499
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000526 | Grad Max: 0.014218
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022499 | Grad Max: 0.086304
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000935
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004980 | Grad Max: 0.010682
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000384
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001508 | Grad Max: 0.003671
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002399 | Grad Max: 0.004367
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040705 | Grad Max: 0.040705
[GRADIENT NORM TOTAL] 7.0861

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.355
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5049357  0.49506432] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 509/1539 | B: 363/1685 | C: 261/1787
[LOSS Ex1] A: 0.67184 | B: 0.66993 | C: 0.66580
[LOGITS Ex2 A] Mean Abs: 1.572 | Max: 5.329
[LOSS Ex2] A: 0.20829 | B: 0.40754 | C: 0.33243
** [JOINT LOSS] ** : 0.985278
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.050488
  -> Layer: shared_layers.0.bias | Grad Mean: 0.085522 | Grad Max: 0.456120
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.006730
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002700 | Grad Max: 0.002700
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000673 | Grad Max: 0.075989
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012387 | Grad Max: 0.429044
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.004557
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005895 | Grad Max: 0.031890
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000301
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001278 | Grad Max: 0.003270
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000134
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000382 | Grad Max: 0.001127
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000701 | Grad Max: 0.001921
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010686 | Grad Max: 0.010686
[GRADIENT NORM TOTAL] 1.8069

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.243
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52264535 0.47735465] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 510/1538 | B: 382/1666 | C: 256/1792
[LOSS Ex1] A: 0.67136 | B: 0.66706 | C: 0.66618
[LOGITS Ex2 A] Mean Abs: 1.541 | Max: 5.820
[LOSS Ex2] A: 0.21199 | B: 0.39016 | C: 0.33606
** [JOINT LOSS] ** : 0.980937
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002405 | Grad Max: 0.073472
  -> Layer: shared_layers.0.bias | Grad Mean: 0.187931 | Grad Max: 0.914569
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.007719
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003084 | Grad Max: 0.003084
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001217 | Grad Max: 0.111801
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022120 | Grad Max: 0.625836
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.008410
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010627 | Grad Max: 0.051773
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000424
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002250 | Grad Max: 0.005295
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000179
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000672 | Grad Max: 0.001748
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001038 | Grad Max: 0.002319
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017956 | Grad Max: 0.017956
[GRADIENT NORM TOTAL] 3.6736

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.400
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5952577  0.40474236] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.030
[MASKS] A(Pass/Fail): 573/1475 | B: 352/1504 | C: 281/1767
[LOSS Ex1] A: 0.66870 | B: 0.66994 | C: 0.66438
[LOGITS Ex2 A] Mean Abs: 1.602 | Max: 5.578
[LOSS Ex2] A: 0.19835 | B: 0.38906 | C: 0.32299
** [JOINT LOSS] ** : 0.971143
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002907 | Grad Max: 0.076800
  -> Layer: shared_layers.0.bias | Grad Mean: 0.050677 | Grad Max: 0.255436
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.009016
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015323 | Grad Max: 0.015323
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000516 | Grad Max: 0.124013
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008983 | Grad Max: 0.701592
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000083 | Grad Max: 0.003112
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003188 | Grad Max: 0.019424
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000252
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000717 | Grad Max: 0.002782
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000223 | Grad Max: 0.000848
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000463 | Grad Max: 0.001853
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006674 | Grad Max: 0.006674
[GRADIENT NORM TOTAL] 1.5243

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.446
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50051963 0.49948034] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 534/1514 | B: 365/1683 | C: 257/1791
[LOSS Ex1] A: 0.67216 | B: 0.66924 | C: 0.66660
[LOGITS Ex2 A] Mean Abs: 1.604 | Max: 6.002
[LOSS Ex2] A: 0.20672 | B: 0.40808 | C: 0.32995
** [JOINT LOSS] ** : 0.984250
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002512 | Grad Max: 0.074480
  -> Layer: shared_layers.0.bias | Grad Mean: 0.034159 | Grad Max: 0.150082
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001969 | Grad Max: 0.007651
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007910 | Grad Max: 0.007910
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000390 | Grad Max: 0.092383
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006026 | Grad Max: 0.516520
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002230
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001503 | Grad Max: 0.012848
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000210
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000272 | Grad Max: 0.001832
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000436
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000366 | Grad Max: 0.000970
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001857 | Grad Max: 0.001857
[GRADIENT NORM TOTAL] 1.0655

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.234
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.585241   0.41475895] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 525/1523 | B: 365/1683 | C: 253/1795
[LOSS Ex1] A: 0.67020 | B: 0.66976 | C: 0.66679
[LOGITS Ex2 A] Mean Abs: 1.580 | Max: 5.568
[LOSS Ex2] A: 0.21903 | B: 0.41480 | C: 0.32897
** [JOINT LOSS] ** : 0.989848
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003232 | Grad Max: 0.086449
  -> Layer: shared_layers.0.bias | Grad Mean: 0.091592 | Grad Max: 0.409298
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.007413
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001553 | Grad Max: 0.001553
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000664 | Grad Max: 0.106401
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012166 | Grad Max: 0.560784
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.004407
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005329 | Grad Max: 0.028301
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000292
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001207 | Grad Max: 0.003171
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000157
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000375 | Grad Max: 0.001137
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000766 | Grad Max: 0.001867
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011202 | Grad Max: 0.011202
[GRADIENT NORM TOTAL] 1.8812

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.310
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55303514 0.44696486] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.030
[MASKS] A(Pass/Fail): 437/1179 | B: 384/1664 | C: 272/1776
[LOSS Ex1] A: 0.66913 | B: 0.66686 | C: 0.66351
[LOGITS Ex2 A] Mean Abs: 1.647 | Max: 5.619
[LOSS Ex2] A: 0.20867 | B: 0.38480 | C: 0.31912
** [JOINT LOSS] ** : 0.970699
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001753 | Grad Max: 0.039900
  -> Layer: shared_layers.0.bias | Grad Mean: 0.117040 | Grad Max: 0.470564
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.007548
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002883 | Grad Max: 0.002883
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000812 | Grad Max: 0.104062
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014935 | Grad Max: 0.587553
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.006287
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006162 | Grad Max: 0.031284
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000342
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001347 | Grad Max: 0.003442
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000131
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000408 | Grad Max: 0.001157
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000518 | Grad Max: 0.001881
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010381 | Grad Max: 0.010381
[GRADIENT NORM TOTAL] 2.4788

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.448
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50608456 0.4939154 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.030
[MASKS] A(Pass/Fail): 563/1485 | B: 352/1504 | C: 272/1776
[LOSS Ex1] A: 0.66930 | B: 0.66973 | C: 0.66601
[LOGITS Ex2 A] Mean Abs: 1.635 | Max: 5.615
[LOSS Ex2] A: 0.21004 | B: 0.38352 | C: 0.31689
** [JOINT LOSS] ** : 0.971829
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002779 | Grad Max: 0.071800
  -> Layer: shared_layers.0.bias | Grad Mean: 0.097732 | Grad Max: 0.350975
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.008651
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011567 | Grad Max: 0.011567
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000733 | Grad Max: 0.066162
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013157 | Grad Max: 0.371125
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.004642
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005736 | Grad Max: 0.027772
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000374
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001273 | Grad Max: 0.003523
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000151
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000389 | Grad Max: 0.001303
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000597 | Grad Max: 0.002140
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010821 | Grad Max: 0.010821
[GRADIENT NORM TOTAL] 1.9413

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.411
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.505437 0.494563] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 552/1496 | B: 365/1683 | C: 263/1785
[LOSS Ex1] A: 0.66808 | B: 0.66901 | C: 0.66487
[LOGITS Ex2 A] Mean Abs: 1.598 | Max: 6.199
[LOSS Ex2] A: 0.21230 | B: 0.42833 | C: 0.32712
** [JOINT LOSS] ** : 0.989904
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003177 | Grad Max: 0.106718
  -> Layer: shared_layers.0.bias | Grad Mean: 0.285316 | Grad Max: 1.381896
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.008767
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009330 | Grad Max: 0.009330
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.274313
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036700 | Grad Max: 1.551086
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.010791
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016006 | Grad Max: 0.071640
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000611
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003516 | Grad Max: 0.007412
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000285
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001058 | Grad Max: 0.002865
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001686 | Grad Max: 0.003161
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028322 | Grad Max: 0.028322
[GRADIENT NORM TOTAL] 6.0420

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.423
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50180054 0.4981994 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 540/1508 | B: 367/1681 | C: 276/1772
[LOSS Ex1] A: 0.66716 | B: 0.66953 | C: 0.66375
[LOGITS Ex2 A] Mean Abs: 1.588 | Max: 6.804
[LOSS Ex2] A: 0.22209 | B: 0.42532 | C: 0.32662
** [JOINT LOSS] ** : 0.991490
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004710 | Grad Max: 0.127031
  -> Layer: shared_layers.0.bias | Grad Mean: 0.352627 | Grad Max: 1.621086
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.008367
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001851 | Grad Max: 0.001851
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002370 | Grad Max: 0.313346
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044691 | Grad Max: 1.751634
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.014385
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020851 | Grad Max: 0.089648
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000805
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004449 | Grad Max: 0.009155
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000321
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001318 | Grad Max: 0.003053
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001998 | Grad Max: 0.003643
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034766 | Grad Max: 0.034766
[GRADIENT NORM TOTAL] 7.1427

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.359
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047899  0.49521014] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.525 | Std: 0.030
[MASKS] A(Pass/Fail): 513/1535 | B: 385/1663 | C: 266/1782
[LOSS Ex1] A: 0.67158 | B: 0.66662 | C: 0.66670
[LOGITS Ex2 A] Mean Abs: 1.576 | Max: 5.798
[LOSS Ex2] A: 0.20169 | B: 0.39306 | C: 0.34866
** [JOINT LOSS] ** : 0.982770
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.041885
  -> Layer: shared_layers.0.bias | Grad Mean: 0.088274 | Grad Max: 0.423648
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001903 | Grad Max: 0.006351
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002944 | Grad Max: 0.002944
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000694 | Grad Max: 0.046195
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012540 | Grad Max: 0.256276
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000137 | Grad Max: 0.004936
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005715 | Grad Max: 0.028117
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000276
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001188 | Grad Max: 0.003397
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000127
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000357 | Grad Max: 0.001138
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000687 | Grad Max: 0.001858
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010452 | Grad Max: 0.010452
[GRADIENT NORM TOTAL] 1.7715

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.246
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5229959  0.47700408] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 520/1528 | B: 355/1501 | C: 269/1779
[LOSS Ex1] A: 0.67109 | B: 0.66951 | C: 0.66538
[LOGITS Ex2 A] Mean Abs: 1.608 | Max: 5.532
[LOSS Ex2] A: 0.22133 | B: 0.39239 | C: 0.34176
** [JOINT LOSS] ** : 0.987152
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008226 | Grad Max: 0.194417
  -> Layer: shared_layers.0.bias | Grad Mean: 0.499788 | Grad Max: 2.208280
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001878 | Grad Max: 0.006924
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003018 | Grad Max: 0.003018
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003342 | Grad Max: 0.277116
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062650 | Grad Max: 1.465072
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000674 | Grad Max: 0.019225
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029332 | Grad Max: 0.128707
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001095
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006425 | Grad Max: 0.013907
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000455
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001937 | Grad Max: 0.004543
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003115 | Grad Max: 0.005727
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052521 | Grad Max: 0.052521
[GRADIENT NORM TOTAL] 9.6824

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.404
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59694856 0.40305147] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.030
[MASKS] A(Pass/Fail): 584/1464 | B: 367/1681 | C: 263/1785
[LOSS Ex1] A: 0.66838 | B: 0.66880 | C: 0.66593
[LOGITS Ex2 A] Mean Abs: 1.654 | Max: 5.345
[LOSS Ex2] A: 0.22611 | B: 0.45722 | C: 0.33459
** [JOINT LOSS] ** : 1.007012
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010960 | Grad Max: 0.258725
  -> Layer: shared_layers.0.bias | Grad Mean: 0.743837 | Grad Max: 3.247124
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.008782
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013555 | Grad Max: 0.013555
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004957 | Grad Max: 0.408639
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.093133 | Grad Max: 2.190953
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001013 | Grad Max: 0.028304
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044143 | Grad Max: 0.193477
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000140 | Grad Max: 0.001728
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009633 | Grad Max: 0.020916
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000061 | Grad Max: 0.000673
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002893 | Grad Max: 0.006921
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004626 | Grad Max: 0.008165
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.078353 | Grad Max: 0.078353
[GRADIENT NORM TOTAL] 14.3904

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.451
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003653  0.49963468] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 543/1505 | B: 369/1679 | C: 193/1183
[LOSS Ex1] A: 0.67188 | B: 0.66934 | C: 0.66198
[LOGITS Ex2 A] Mean Abs: 1.656 | Max: 6.279
[LOSS Ex2] A: 0.20175 | B: 0.43172 | C: 0.33425
** [JOINT LOSS] ** : 0.990303
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006146 | Grad Max: 0.180212
  -> Layer: shared_layers.0.bias | Grad Mean: 0.527581 | Grad Max: 2.361727
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001982 | Grad Max: 0.006453
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004165 | Grad Max: 0.004165
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003384 | Grad Max: 0.320426
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063873 | Grad Max: 1.821201
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000689 | Grad Max: 0.020652
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030227 | Grad Max: 0.128045
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001169
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006550 | Grad Max: 0.013938
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000459
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001956 | Grad Max: 0.004711
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002907 | Grad Max: 0.005433
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051238 | Grad Max: 0.051238
[GRADIENT NORM TOTAL] 10.3666

[EPOCH SUMMARY] Train Loss: 0.9849

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9570 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9620 -> New: 0.9570)

############################## EPOCH 60/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.238
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5865094  0.41349065] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 538/1510 | B: 389/1659 | C: 258/1790
[LOSS Ex1] A: 0.66990 | B: 0.66643 | C: 0.66543
[LOGITS Ex2 A] Mean Abs: 1.605 | Max: 5.482
[LOSS Ex2] A: 0.22016 | B: 0.38872 | C: 0.31757
** [JOINT LOSS] ** : 0.976067
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.053151
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134163 | Grad Max: 0.637508
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.007869
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003879 | Grad Max: 0.003879
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000946 | Grad Max: 0.108384
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017019 | Grad Max: 0.605516
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000164 | Grad Max: 0.005506
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007140 | Grad Max: 0.034309
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000305
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001528 | Grad Max: 0.003849
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000134
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000466 | Grad Max: 0.001183
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000718 | Grad Max: 0.002206
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013481 | Grad Max: 0.013481
[GRADIENT NORM TOTAL] 2.7932

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.315
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5538697 0.4461303] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.030
[MASKS] A(Pass/Fail): 454/1162 | B: 359/1497 | C: 270/1778
[LOSS Ex1] A: 0.66882 | B: 0.66933 | C: 0.66527
[LOGITS Ex2 A] Mean Abs: 1.595 | Max: 5.643
[LOSS Ex2] A: 0.21534 | B: 0.40566 | C: 0.33941
** [JOINT LOSS] ** : 0.987943
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008758 | Grad Max: 0.211108
  -> Layer: shared_layers.0.bias | Grad Mean: 0.594218 | Grad Max: 2.431098
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001967 | Grad Max: 0.007737
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000863 | Grad Max: 0.000863
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003821 | Grad Max: 0.391829
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.071978 | Grad Max: 2.161278
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000784 | Grad Max: 0.020520
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034116 | Grad Max: 0.132039
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000108 | Grad Max: 0.001327
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007432 | Grad Max: 0.016487
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000535
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002225 | Grad Max: 0.005170
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003511 | Grad Max: 0.006379
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060207 | Grad Max: 0.060207
[GRADIENT NORM TOTAL] 11.3327

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.452
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50600886 0.49399117] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 578/1470 | B: 370/1678 | C: 280/1768
[LOSS Ex1] A: 0.66900 | B: 0.66863 | C: 0.66265
[LOGITS Ex2 A] Mean Abs: 1.582 | Max: 7.112
[LOSS Ex2] A: 0.23550 | B: 0.47492 | C: 0.37092
** [JOINT LOSS] ** : 1.027207
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014241 | Grad Max: 0.323871
  -> Layer: shared_layers.0.bias | Grad Mean: 0.851079 | Grad Max: 3.555363
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.008217
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005260 | Grad Max: 0.005260
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005572 | Grad Max: 0.528082
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.105000 | Grad Max: 2.904001
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001132 | Grad Max: 0.027226
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049238 | Grad Max: 0.184093
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000157 | Grad Max: 0.001818
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010772 | Grad Max: 0.022694
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000069 | Grad Max: 0.000785
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003250 | Grad Max: 0.007317
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.005095 | Grad Max: 0.010904
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.088663 | Grad Max: 0.088663
[GRADIENT NORM TOTAL] 16.2341

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.413
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5055204  0.49447954] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 560/1488 | B: 370/1678 | C: 269/1779
[LOSS Ex1] A: 0.66778 | B: 0.66918 | C: 0.66570
[LOGITS Ex2 A] Mean Abs: 1.564 | Max: 5.905
[LOSS Ex2] A: 0.21821 | B: 0.45370 | C: 0.36443
** [JOINT LOSS] ** : 1.012996
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011302 | Grad Max: 0.253828
  -> Layer: shared_layers.0.bias | Grad Mean: 0.775810 | Grad Max: 3.210457
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.008239
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006923 | Grad Max: 0.006923
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004942 | Grad Max: 0.486739
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.093722 | Grad Max: 2.667166
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001013 | Grad Max: 0.028469
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044303 | Grad Max: 0.185407
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000140 | Grad Max: 0.001610
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009672 | Grad Max: 0.020095
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000061 | Grad Max: 0.000671
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002912 | Grad Max: 0.006660
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004499 | Grad Max: 0.008952
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.078906 | Grad Max: 0.078906
[GRADIENT NORM TOTAL] 14.8139

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.425
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50172675 0.49827328] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.030
[MASKS] A(Pass/Fail): 546/1502 | B: 390/1658 | C: 278/1770
[LOSS Ex1] A: 0.66687 | B: 0.66626 | C: 0.66422
[LOGITS Ex2 A] Mean Abs: 1.581 | Max: 6.668
[LOSS Ex2] A: 0.23090 | B: 0.41120 | C: 0.33313
** [JOINT LOSS] ** : 0.990860
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005342 | Grad Max: 0.120591
  -> Layer: shared_layers.0.bias | Grad Mean: 0.387335 | Grad Max: 1.611042
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.008668
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007160 | Grad Max: 0.007160
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002417 | Grad Max: 0.284920
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045666 | Grad Max: 1.596665
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.013835
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021626 | Grad Max: 0.095068
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000743
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004687 | Grad Max: 0.009810
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000316
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001404 | Grad Max: 0.003188
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002189 | Grad Max: 0.004163
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038192 | Grad Max: 0.038192
[GRADIENT NORM TOTAL] 7.4954

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.360
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047223 0.4952777] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 521/1527 | B: 359/1497 | C: 275/1773
[LOSS Ex1] A: 0.67134 | B: 0.66918 | C: 0.66329
[LOGITS Ex2 A] Mean Abs: 1.606 | Max: 6.146
[LOSS Ex2] A: 0.21673 | B: 0.37825 | C: 0.31826
** [JOINT LOSS] ** : 0.972349
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004846 | Grad Max: 0.123933
  -> Layer: shared_layers.0.bias | Grad Mean: 0.232595 | Grad Max: 0.993687
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.007458
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004075 | Grad Max: 0.004075
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001588 | Grad Max: 0.168211
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030045 | Grad Max: 0.872708
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.010894
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014534 | Grad Max: 0.067395
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000585
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003169 | Grad Max: 0.007046
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000959 | Grad Max: 0.002209
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001542 | Grad Max: 0.003585
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026691 | Grad Max: 0.026691
[GRADIENT NORM TOTAL] 4.5482

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.248
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5232282 0.4767718] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.525 | Std: 0.029
[MASKS] A(Pass/Fail): 527/1521 | B: 373/1675 | C: 249/1799
[LOSS Ex1] A: 0.67087 | B: 0.66849 | C: 0.66591
[LOGITS Ex2 A] Mean Abs: 1.615 | Max: 6.501
[LOSS Ex2] A: 0.22981 | B: 0.43196 | C: 0.32766
** [JOINT LOSS] ** : 0.998231
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008789 | Grad Max: 0.213821
  -> Layer: shared_layers.0.bias | Grad Mean: 0.500862 | Grad Max: 2.172618
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001941 | Grad Max: 0.007395
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002658 | Grad Max: 0.002658
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003311 | Grad Max: 0.290745
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062490 | Grad Max: 1.485112
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000664 | Grad Max: 0.017013
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028982 | Grad Max: 0.117468
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001102
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006348 | Grad Max: 0.013703
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000472
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001902 | Grad Max: 0.004635
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002958 | Grad Max: 0.005133
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050610 | Grad Max: 0.050610
[GRADIENT NORM TOTAL] 9.5075

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.407
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59814197 0.40185803] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.031
[MASKS] A(Pass/Fail): 589/1459 | B: 370/1678 | C: 264/1784
[LOSS Ex1] A: 0.66813 | B: 0.66904 | C: 0.66537
[LOGITS Ex2 A] Mean Abs: 1.639 | Max: 6.037
[LOSS Ex2] A: 0.21414 | B: 0.41335 | C: 0.34664
** [JOINT LOSS] ** : 0.992224
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006722 | Grad Max: 0.178380
  -> Layer: shared_layers.0.bias | Grad Mean: 0.356520 | Grad Max: 1.542742
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.007594
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000943 | Grad Max: 0.000943
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002455 | Grad Max: 0.214602
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046073 | Grad Max: 1.146435
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000490 | Grad Max: 0.013665
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021337 | Grad Max: 0.087586
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000813
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004699 | Grad Max: 0.010187
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000350
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001411 | Grad Max: 0.003347
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002116 | Grad Max: 0.003998
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036655 | Grad Max: 0.036655
[GRADIENT NORM TOTAL] 6.9180

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.454
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50032395 0.49967608] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 547/1501 | B: 392/1656 | C: 256/1792
[LOSS Ex1] A: 0.67167 | B: 0.66612 | C: 0.66788
[LOGITS Ex2 A] Mean Abs: 1.606 | Max: 5.532
[LOSS Ex2] A: 0.19655 | B: 0.38827 | C: 0.31583
** [JOINT LOSS] ** : 0.968771
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002375 | Grad Max: 0.064160
  -> Layer: shared_layers.0.bias | Grad Mean: 0.053162 | Grad Max: 0.259987
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.007269
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004116 | Grad Max: 0.004116
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000504 | Grad Max: 0.065639
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008512 | Grad Max: 0.358803
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.004105
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002850 | Grad Max: 0.015876
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000229
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000657 | Grad Max: 0.003065
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000127
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000192 | Grad Max: 0.000747
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000582 | Grad Max: 0.001724
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004392 | Grad Max: 0.004392
[GRADIENT NORM TOTAL] 1.3006

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.241
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.587352   0.41264802] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.031
[MASKS] A(Pass/Fail): 542/1506 | B: 360/1496 | C: 289/1759
[LOSS Ex1] A: 0.66968 | B: 0.66904 | C: 0.66191
[LOGITS Ex2 A] Mean Abs: 1.588 | Max: 5.810
[LOSS Ex2] A: 0.21474 | B: 0.38569 | C: 0.33787
** [JOINT LOSS] ** : 0.979643
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002421 | Grad Max: 0.054899
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176122 | Grad Max: 0.769383
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.007947
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004843 | Grad Max: 0.004843
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001130 | Grad Max: 0.116294
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020902 | Grad Max: 0.645543
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000221 | Grad Max: 0.006299
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009664 | Grad Max: 0.040351
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000400
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002111 | Grad Max: 0.005332
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000178
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000626 | Grad Max: 0.001644
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000905 | Grad Max: 0.002067
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015400 | Grad Max: 0.015400
[GRADIENT NORM TOTAL] 3.3746

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.319
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5543638 0.4456362] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 455/1161 | B: 373/1675 | C: 271/1777
[LOSS Ex1] A: 0.66861 | B: 0.66835 | C: 0.66246
[LOGITS Ex2 A] Mean Abs: 1.644 | Max: 5.310
[LOSS Ex2] A: 0.19792 | B: 0.41175 | C: 0.31257
** [JOINT LOSS] ** : 0.973890
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001880 | Grad Max: 0.045277
  -> Layer: shared_layers.0.bias | Grad Mean: 0.037093 | Grad Max: 0.155444
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.007906
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000681 | Grad Max: 0.000681
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000360 | Grad Max: 0.097521
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006009 | Grad Max: 0.532692
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002434
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001250 | Grad Max: 0.012139
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000175
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000206 | Grad Max: 0.001347
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000064
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000061 | Grad Max: 0.000398
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000337 | Grad Max: 0.001022
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000475 | Grad Max: 0.000475
[GRADIENT NORM TOTAL] 1.1833

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.455
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059968  0.49400315] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 580/1468 | B: 372/1676 | C: 254/1794
[LOSS Ex1] A: 0.66878 | B: 0.66888 | C: 0.66530
[LOGITS Ex2 A] Mean Abs: 1.638 | Max: 7.406
[LOSS Ex2] A: 0.20388 | B: 0.41100 | C: 0.31929
** [JOINT LOSS] ** : 0.979043
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002934 | Grad Max: 0.075179
  -> Layer: shared_layers.0.bias | Grad Mean: 0.147261 | Grad Max: 0.664273
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002002 | Grad Max: 0.007454
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003332 | Grad Max: 0.003332
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001055 | Grad Max: 0.105473
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019213 | Grad Max: 0.586025
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000196 | Grad Max: 0.006449
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008555 | Grad Max: 0.039479
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000392
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001907 | Grad Max: 0.004942
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000161
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000585 | Grad Max: 0.001655
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000921 | Grad Max: 0.002889
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016173 | Grad Max: 0.016173
[GRADIENT NORM TOTAL] 2.9799

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.415
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5055698  0.49443012] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 564/1484 | B: 396/1652 | C: 261/1787
[LOSS Ex1] A: 0.66754 | B: 0.66593 | C: 0.66457
[LOGITS Ex2 A] Mean Abs: 1.607 | Max: 5.472
[LOSS Ex2] A: 0.20255 | B: 0.39556 | C: 0.30912
** [JOINT LOSS] ** : 0.968422
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001592 | Grad Max: 0.043004
  -> Layer: shared_layers.0.bias | Grad Mean: 0.041761 | Grad Max: 0.331004
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.008911
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012408 | Grad Max: 0.012408
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000338 | Grad Max: 0.169745
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005659 | Grad Max: 0.954903
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002796
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001186 | Grad Max: 0.010472
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000197
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000222 | Grad Max: 0.002018
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000081 | Grad Max: 0.000428
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000478 | Grad Max: 0.001298
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002226 | Grad Max: 0.002226
[GRADIENT NORM TOTAL] 1.4045

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.427
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017212  0.49827883] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 550/1498 | B: 366/1490 | C: 182/1194
[LOSS Ex1] A: 0.66662 | B: 0.66884 | C: 0.66653
[LOGITS Ex2 A] Mean Abs: 1.597 | Max: 5.854
[LOSS Ex2] A: 0.21991 | B: 0.39792 | C: 0.32311
** [JOINT LOSS] ** : 0.980979
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003332 | Grad Max: 0.064914
  -> Layer: shared_layers.0.bias | Grad Mean: 0.212949 | Grad Max: 0.827534
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.008647
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009351 | Grad Max: 0.009351
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001315 | Grad Max: 0.169734
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024678 | Grad Max: 0.954115
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.007687
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011237 | Grad Max: 0.047972
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000456
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002404 | Grad Max: 0.005412
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000706 | Grad Max: 0.001834
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001089 | Grad Max: 0.002303
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018264 | Grad Max: 0.018264
[GRADIENT NORM TOTAL] 3.9248

[EPOCH SUMMARY] Train Loss: 0.9863

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9541 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9570 -> New: 0.9541)

############################## EPOCH 61/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.362
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50470597 0.49529406] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 522/1526 | B: 380/1668 | C: 274/1774
[LOSS Ex1] A: 0.67111 | B: 0.66813 | C: 0.66381
[LOGITS Ex2 A] Mean Abs: 1.591 | Max: 5.748
[LOSS Ex2] A: 0.20720 | B: 0.41035 | C: 0.34098
** [JOINT LOSS] ** : 0.987195
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003410 | Grad Max: 0.103932
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084690 | Grad Max: 0.322226
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.006844
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002698 | Grad Max: 0.002698
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000700 | Grad Max: 0.115629
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012365 | Grad Max: 0.631008
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.004077
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005736 | Grad Max: 0.023896
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000323
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001320 | Grad Max: 0.003633
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000136
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000409 | Grad Max: 0.001215
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000774 | Grad Max: 0.001873
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011919 | Grad Max: 0.011919
[GRADIENT NORM TOTAL] 1.7879

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.251
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5234204 0.4765796] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 529/1519 | B: 380/1668 | C: 289/1759
[LOSS Ex1] A: 0.67063 | B: 0.66866 | C: 0.66351
[LOGITS Ex2 A] Mean Abs: 1.595 | Max: 5.997
[LOSS Ex2] A: 0.20792 | B: 0.42305 | C: 0.32380
** [JOINT LOSS] ** : 0.985856
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006030 | Grad Max: 0.143572
  -> Layer: shared_layers.0.bias | Grad Mean: 0.426551 | Grad Max: 1.998412
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001916 | Grad Max: 0.007274
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000620 | Grad Max: 0.000620
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002802 | Grad Max: 0.258407
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052280 | Grad Max: 1.436124
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.015260
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024260 | Grad Max: 0.104074
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000890
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005337 | Grad Max: 0.011090
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000389
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001597 | Grad Max: 0.003924
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002345 | Grad Max: 0.004931
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041468 | Grad Max: 0.041468
[GRADIENT NORM TOTAL] 8.3590

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.412
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59945875 0.40054125] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.031
[MASKS] A(Pass/Fail): 594/1454 | B: 407/1641 | C: 257/1791
[LOSS Ex1] A: 0.66785 | B: 0.66569 | C: 0.66558
[LOGITS Ex2 A] Mean Abs: 1.651 | Max: 5.911
[LOSS Ex2] A: 0.21277 | B: 0.41561 | C: 0.36073
** [JOINT LOSS] ** : 0.996078
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008454 | Grad Max: 0.200331
  -> Layer: shared_layers.0.bias | Grad Mean: 0.584588 | Grad Max: 2.582797
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.008589
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011922 | Grad Max: 0.011922
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003906 | Grad Max: 0.356968
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073203 | Grad Max: 1.996109
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000785 | Grad Max: 0.023323
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034471 | Grad Max: 0.154390
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000107 | Grad Max: 0.001239
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007537 | Grad Max: 0.015832
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000505
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002258 | Grad Max: 0.005228
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003527 | Grad Max: 0.006425
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060669 | Grad Max: 0.060669
[GRADIENT NORM TOTAL] 11.4713

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.459
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50030273 0.49969724] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 553/1495 | B: 370/1486 | C: 285/1763
[LOSS Ex1] A: 0.67141 | B: 0.66863 | C: 0.66124
[LOGITS Ex2 A] Mean Abs: 1.646 | Max: 5.442
[LOSS Ex2] A: 0.19524 | B: 0.39629 | C: 0.31459
** [JOINT LOSS] ** : 0.969136
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004748 | Grad Max: 0.123601
  -> Layer: shared_layers.0.bias | Grad Mean: 0.332837 | Grad Max: 1.502105
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.007556
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008100 | Grad Max: 0.008100
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.233401
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042324 | Grad Max: 1.315708
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.013564
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020198 | Grad Max: 0.089369
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000741
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004348 | Grad Max: 0.009093
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001305 | Grad Max: 0.003218
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002025 | Grad Max: 0.004415
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035625 | Grad Max: 0.035625
[GRADIENT NORM TOTAL] 6.6992

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.245
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5883854  0.41161466] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 545/1503 | B: 388/1660 | C: 248/1800
[LOSS Ex1] A: 0.66939 | B: 0.66793 | C: 0.66534
[LOGITS Ex2 A] Mean Abs: 1.599 | Max: 6.337
[LOSS Ex2] A: 0.20643 | B: 0.42129 | C: 0.33090
** [JOINT LOSS] ** : 0.987094
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003372 | Grad Max: 0.075008
  -> Layer: shared_layers.0.bias | Grad Mean: 0.243396 | Grad Max: 1.115784
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001977 | Grad Max: 0.007364
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000132 | Grad Max: 0.000132
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001619 | Grad Max: 0.121927
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030350 | Grad Max: 0.674740
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.009789
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014472 | Grad Max: 0.059059
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000602
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003150 | Grad Max: 0.007063
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000935 | Grad Max: 0.002164
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.002591
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024300 | Grad Max: 0.024300
[GRADIENT NORM TOTAL] 4.7002

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.324
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55503696 0.44496307] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.031
[MASKS] A(Pass/Fail): 455/1161 | B: 386/1662 | C: 256/1792
[LOSS Ex1] A: 0.66831 | B: 0.66847 | C: 0.66587
[LOGITS Ex2 A] Mean Abs: 1.621 | Max: 5.633
[LOSS Ex2] A: 0.20719 | B: 0.41813 | C: 0.31017
** [JOINT LOSS] ** : 0.979377
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006963 | Grad Max: 0.185997
  -> Layer: shared_layers.0.bias | Grad Mean: 0.426402 | Grad Max: 1.934838
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.007556
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002714 | Grad Max: 0.002714
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002784 | Grad Max: 0.239688
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052777 | Grad Max: 1.366287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.015299
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025421 | Grad Max: 0.102793
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000954
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005572 | Grad Max: 0.012334
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000394
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001655 | Grad Max: 0.003871
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002551 | Grad Max: 0.004431
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043796 | Grad Max: 0.043796
[GRADIENT NORM TOTAL] 8.1694

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.461
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50599927 0.49400067] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 581/1467 | B: 411/1637 | C: 277/1771
[LOSS Ex1] A: 0.66848 | B: 0.66550 | C: 0.66317
[LOGITS Ex2 A] Mean Abs: 1.629 | Max: 6.915
[LOSS Ex2] A: 0.21290 | B: 0.39463 | C: 0.30231
** [JOINT LOSS] ** : 0.968995
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002588 | Grad Max: 0.066732
  -> Layer: shared_layers.0.bias | Grad Mean: 0.163984 | Grad Max: 0.863400
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.007547
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001604 | Grad Max: 0.001604
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001075 | Grad Max: 0.147098
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020027 | Grad Max: 0.840229
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000217 | Grad Max: 0.006943
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009487 | Grad Max: 0.043191
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000404
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002049 | Grad Max: 0.005072
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000188
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000607 | Grad Max: 0.001623
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000945 | Grad Max: 0.002301
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015277 | Grad Max: 0.015277
[GRADIENT NORM TOTAL] 3.3292

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.418
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50557816 0.49442187] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 566/1482 | B: 372/1484 | C: 273/1775
[LOSS Ex1] A: 0.66723 | B: 0.66845 | C: 0.66419
[LOGITS Ex2 A] Mean Abs: 1.656 | Max: 5.785
[LOSS Ex2] A: 0.21166 | B: 0.39260 | C: 0.30824
** [JOINT LOSS] ** : 0.970793
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007123 | Grad Max: 0.159653
  -> Layer: shared_layers.0.bias | Grad Mean: 0.316597 | Grad Max: 1.518245
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.008494
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010917 | Grad Max: 0.010917
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002240 | Grad Max: 0.218808
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041786 | Grad Max: 1.156738
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000458 | Grad Max: 0.012216
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019927 | Grad Max: 0.080652
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000777
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004397 | Grad Max: 0.009819
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000305
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001328 | Grad Max: 0.003092
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002139 | Grad Max: 0.004450
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036675 | Grad Max: 0.036675
[GRADIENT NORM TOTAL] 6.2014

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.430
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50170904 0.498291  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 556/1492 | B: 393/1655 | C: 263/1785
[LOSS Ex1] A: 0.66632 | B: 0.66776 | C: 0.66313
[LOGITS Ex2 A] Mean Abs: 1.672 | Max: 6.341
[LOSS Ex2] A: 0.22774 | B: 0.41651 | C: 0.34280
** [JOINT LOSS] ** : 0.994750
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008461 | Grad Max: 0.235956
  -> Layer: shared_layers.0.bias | Grad Mean: 0.436706 | Grad Max: 2.080117
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.008593
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005887 | Grad Max: 0.005887
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003016 | Grad Max: 0.298752
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056378 | Grad Max: 1.628642
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000606 | Grad Max: 0.014412
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026661 | Grad Max: 0.101341
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000996
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005892 | Grad Max: 0.012602
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000428
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001762 | Grad Max: 0.004192
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002722 | Grad Max: 0.004819
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046405 | Grad Max: 0.046405
[GRADIENT NORM TOTAL] 8.5025

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.364
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046973  0.49530262] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.031
[MASKS] A(Pass/Fail): 528/1520 | B: 392/1656 | C: 266/1782
[LOSS Ex1] A: 0.67085 | B: 0.66830 | C: 0.66530
[LOGITS Ex2 A] Mean Abs: 1.624 | Max: 6.068
[LOSS Ex2] A: 0.20739 | B: 0.40190 | C: 0.32156
** [JOINT LOSS] ** : 0.978434
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003661 | Grad Max: 0.089028
  -> Layer: shared_layers.0.bias | Grad Mean: 0.183158 | Grad Max: 0.864889
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001881 | Grad Max: 0.006260
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004116 | Grad Max: 0.004116
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001241 | Grad Max: 0.173727
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023264 | Grad Max: 0.951807
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.007417
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010194 | Grad Max: 0.045133
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000414
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002216 | Grad Max: 0.005353
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000181
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000662 | Grad Max: 0.001654
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000911 | Grad Max: 0.002579
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016683 | Grad Max: 0.016683
[GRADIENT NORM TOTAL] 3.8058

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.254
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5235718 0.4764282] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 539/1509 | B: 413/1635 | C: 260/1788
[LOSS Ex1] A: 0.67038 | B: 0.66533 | C: 0.66468
[LOGITS Ex2 A] Mean Abs: 1.541 | Max: 6.278
[LOSS Ex2] A: 0.21199 | B: 0.40255 | C: 0.31076
** [JOINT LOSS] ** : 0.975227
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004339 | Grad Max: 0.114428
  -> Layer: shared_layers.0.bias | Grad Mean: 0.305227 | Grad Max: 1.396050
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.007761
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005191 | Grad Max: 0.005191
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.202137
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037390 | Grad Max: 1.138028
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.011993
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018318 | Grad Max: 0.079839
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000702
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003993 | Grad Max: 0.008840
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000308
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001178 | Grad Max: 0.002934
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001732 | Grad Max: 0.003188
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029935 | Grad Max: 0.029935
[GRADIENT NORM TOTAL] 5.8765

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.414
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6006655 0.3993345] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 596/1452 | B: 378/1478 | C: 282/1766
[LOSS Ex1] A: 0.66757 | B: 0.66829 | C: 0.66360
[LOGITS Ex2 A] Mean Abs: 1.583 | Max: 5.829
[LOSS Ex2] A: 0.20567 | B: 0.40055 | C: 0.32285
** [JOINT LOSS] ** : 0.976180
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005610 | Grad Max: 0.132878
  -> Layer: shared_layers.0.bias | Grad Mean: 0.432371 | Grad Max: 1.945987
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.008866
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015609 | Grad Max: 0.015609
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002635 | Grad Max: 0.297135
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050247 | Grad Max: 1.658512
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000546 | Grad Max: 0.014312
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024216 | Grad Max: 0.095575
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000833
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005253 | Grad Max: 0.011180
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000406
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001551 | Grad Max: 0.003848
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002215 | Grad Max: 0.004273
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039348 | Grad Max: 0.039348
[GRADIENT NORM TOTAL] 8.0861

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.463
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003233  0.49967673] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 560/1488 | B: 398/1650 | C: 280/1768
[LOSS Ex1] A: 0.67117 | B: 0.66761 | C: 0.66393
[LOGITS Ex2 A] Mean Abs: 1.589 | Max: 5.582
[LOSS Ex2] A: 0.19942 | B: 0.41859 | C: 0.31202
** [JOINT LOSS] ** : 0.977578
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005204 | Grad Max: 0.155215
  -> Layer: shared_layers.0.bias | Grad Mean: 0.279352 | Grad Max: 1.231132
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.007383
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004907 | Grad Max: 0.004907
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.183455
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032635 | Grad Max: 0.996078
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000354 | Grad Max: 0.009035
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015535 | Grad Max: 0.065851
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000652
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003416 | Grad Max: 0.007667
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000237
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001018 | Grad Max: 0.002378
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001544 | Grad Max: 0.003015
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026198 | Grad Max: 0.026198
[GRADIENT NORM TOTAL] 5.1777

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.248
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58926284 0.4107372 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 552/1496 | B: 395/1653 | C: 180/1196
[LOSS Ex1] A: 0.66914 | B: 0.66816 | C: 0.66629
[LOGITS Ex2 A] Mean Abs: 1.635 | Max: 5.634
[LOSS Ex2] A: 0.21501 | B: 0.41096 | C: 0.33055
** [JOINT LOSS] ** : 0.986704
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003116 | Grad Max: 0.080792
  -> Layer: shared_layers.0.bias | Grad Mean: 0.231696 | Grad Max: 0.988680
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.007572
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000599 | Grad Max: 0.000599
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001561 | Grad Max: 0.156320
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029138 | Grad Max: 0.876193
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.009917
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013742 | Grad Max: 0.060381
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000521
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002971 | Grad Max: 0.006770
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000880 | Grad Max: 0.002217
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001241 | Grad Max: 0.002815
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022739 | Grad Max: 0.022739
[GRADIENT NORM TOTAL] 4.7420

[EPOCH SUMMARY] Train Loss: 0.9810

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9639 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 62/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.328
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55551356 0.4444864 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.031
[MASKS] A(Pass/Fail): 459/1157 | B: 418/1630 | C: 286/1762
[LOSS Ex1] A: 0.66806 | B: 0.66518 | C: 0.66121
[LOGITS Ex2 A] Mean Abs: 1.682 | Max: 5.555
[LOSS Ex2] A: 0.20190 | B: 0.39534 | C: 0.31858
** [JOINT LOSS] ** : 0.970089
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004672 | Grad Max: 0.141840
  -> Layer: shared_layers.0.bias | Grad Mean: 0.427139 | Grad Max: 1.902636
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.008201
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004249 | Grad Max: 0.004249
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002680 | Grad Max: 0.258729
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050545 | Grad Max: 1.428795
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000545 | Grad Max: 0.014782
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024303 | Grad Max: 0.099855
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000918
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005289 | Grad Max: 0.011766
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000361
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001580 | Grad Max: 0.003660
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002362 | Grad Max: 0.004921
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042158 | Grad Max: 0.042158
[GRADIENT NORM TOTAL] 8.3005

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.463
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060262  0.49397376] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 587/1461 | B: 384/1472 | C: 272/1776
[LOSS Ex1] A: 0.66823 | B: 0.66815 | C: 0.66441
[LOGITS Ex2 A] Mean Abs: 1.662 | Max: 7.002
[LOSS Ex2] A: 0.20998 | B: 0.38252 | C: 0.31234
** [JOINT LOSS] ** : 0.968544
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002613 | Grad Max: 0.088022
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208232 | Grad Max: 0.996737
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.007718
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005571 | Grad Max: 0.005571
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001341 | Grad Max: 0.128303
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024935 | Grad Max: 0.714305
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.009170
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012618 | Grad Max: 0.055017
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000542
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002726 | Grad Max: 0.006497
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000819 | Grad Max: 0.002073
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001192 | Grad Max: 0.002897
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021598 | Grad Max: 0.021598
[GRADIENT NORM TOTAL] 4.0893

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.420
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5055786  0.49442142] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.032
[MASKS] A(Pass/Fail): 569/1479 | B: 401/1647 | C: 277/1771
[LOSS Ex1] A: 0.66698 | B: 0.66747 | C: 0.66392
[LOGITS Ex2 A] Mean Abs: 1.622 | Max: 5.443
[LOSS Ex2] A: 0.20016 | B: 0.41234 | C: 0.31261
** [JOINT LOSS] ** : 0.974495
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003223 | Grad Max: 0.073760
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239793 | Grad Max: 1.050572
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.009040
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015104 | Grad Max: 0.015104
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001571 | Grad Max: 0.194600
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029577 | Grad Max: 1.103725
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000292 | Grad Max: 0.008979
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012992 | Grad Max: 0.056972
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000555
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002821 | Grad Max: 0.006565
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000213
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000830 | Grad Max: 0.002025
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001208 | Grad Max: 0.002449
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020772 | Grad Max: 0.020772
[GRADIENT NORM TOTAL] 4.8856

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.431
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017111  0.49828896] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 561/1487 | B: 396/1652 | C: 257/1791
[LOSS Ex1] A: 0.66607 | B: 0.66802 | C: 0.66650
[LOGITS Ex2 A] Mean Abs: 1.587 | Max: 6.243
[LOSS Ex2] A: 0.21899 | B: 0.42542 | C: 0.29740
** [JOINT LOSS] ** : 0.980797
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005431 | Grad Max: 0.135799
  -> Layer: shared_layers.0.bias | Grad Mean: 0.411576 | Grad Max: 1.835365
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.008376
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010651 | Grad Max: 0.010651
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002631 | Grad Max: 0.274307
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050083 | Grad Max: 1.497591
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.015730
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023674 | Grad Max: 0.105641
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000806
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005110 | Grad Max: 0.011215
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000413
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001518 | Grad Max: 0.004025
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002345 | Grad Max: 0.004035
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040229 | Grad Max: 0.040229
[GRADIENT NORM TOTAL] 8.0445

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.366
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046788  0.49532127] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.031
[MASKS] A(Pass/Fail): 529/1519 | B: 423/1625 | C: 246/1802
[LOSS Ex1] A: 0.67064 | B: 0.66504 | C: 0.66698
[LOGITS Ex2 A] Mean Abs: 1.588 | Max: 5.931
[LOSS Ex2] A: 0.20606 | B: 0.40972 | C: 0.32926
** [JOINT LOSS] ** : 0.982566
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004801 | Grad Max: 0.114700
  -> Layer: shared_layers.0.bias | Grad Mean: 0.266806 | Grad Max: 1.183149
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.007138
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005608 | Grad Max: 0.005608
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001800 | Grad Max: 0.230934
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033563 | Grad Max: 1.258522
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000355 | Grad Max: 0.010695
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015644 | Grad Max: 0.063422
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000637
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003433 | Grad Max: 0.007763
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000273
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001017 | Grad Max: 0.002672
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001520 | Grad Max: 0.002833
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025975 | Grad Max: 0.025975
[GRADIENT NORM TOTAL] 5.3675

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.257
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52372235 0.47627765] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.030
[MASKS] A(Pass/Fail): 540/1508 | B: 384/1472 | C: 285/1763
[LOSS Ex1] A: 0.67018 | B: 0.66801 | C: 0.66219
[LOGITS Ex2 A] Mean Abs: 1.594 | Max: 6.006
[LOSS Ex2] A: 0.21074 | B: 0.38570 | C: 0.35279
** [JOINT LOSS] ** : 0.983204
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005134 | Grad Max: 0.141973
  -> Layer: shared_layers.0.bias | Grad Mean: 0.225472 | Grad Max: 1.005183
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.007210
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003135 | Grad Max: 0.003135
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001579 | Grad Max: 0.140153
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029241 | Grad Max: 0.791694
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.008830
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013394 | Grad Max: 0.053441
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000631
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002976 | Grad Max: 0.007075
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000891 | Grad Max: 0.002281
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001352 | Grad Max: 0.002928
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023235 | Grad Max: 0.023235
[GRADIENT NORM TOTAL] 4.4129

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.416
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6021155  0.39788446] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 598/1450 | B: 403/1645 | C: 296/1752
[LOSS Ex1] A: 0.66735 | B: 0.66734 | C: 0.66251
[LOGITS Ex2 A] Mean Abs: 1.647 | Max: 6.281
[LOSS Ex2] A: 0.21212 | B: 0.41280 | C: 0.31965
** [JOINT LOSS] ** : 0.980588
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006336 | Grad Max: 0.175567
  -> Layer: shared_layers.0.bias | Grad Mean: 0.329693 | Grad Max: 1.466299
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.007683
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002187 | Grad Max: 0.002187
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.153442
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041590 | Grad Max: 0.850766
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000435 | Grad Max: 0.010272
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019142 | Grad Max: 0.070866
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000700
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004246 | Grad Max: 0.008985
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000327
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.003140
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001821 | Grad Max: 0.003672
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032122 | Grad Max: 0.032122
[GRADIENT NORM TOTAL] 6.2304

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.465
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002693  0.49973068] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 564/1484 | B: 398/1650 | C: 243/1805
[LOSS Ex1] A: 0.67097 | B: 0.66789 | C: 0.66578
[LOGITS Ex2 A] Mean Abs: 1.646 | Max: 5.469
[LOSS Ex2] A: 0.18845 | B: 0.40365 | C: 0.33101
** [JOINT LOSS] ** : 0.975914
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002351 | Grad Max: 0.046485
  -> Layer: shared_layers.0.bias | Grad Mean: 0.095490 | Grad Max: 0.418363
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001897 | Grad Max: 0.007164
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003838 | Grad Max: 0.003838
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000732 | Grad Max: 0.089209
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013058 | Grad Max: 0.491540
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000119 | Grad Max: 0.006104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005126 | Grad Max: 0.035551
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000252
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001029 | Grad Max: 0.003357
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000098
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000299 | Grad Max: 0.000858
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000421 | Grad Max: 0.001522
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007249 | Grad Max: 0.007249
[GRADIENT NORM TOTAL] 2.0628

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.251
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.590055   0.40994498] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.032
[MASKS] A(Pass/Fail): 555/1493 | B: 426/1622 | C: 275/1773
[LOSS Ex1] A: 0.66892 | B: 0.66490 | C: 0.66501
[LOGITS Ex2 A] Mean Abs: 1.588 | Max: 6.054
[LOSS Ex2] A: 0.20951 | B: 0.41086 | C: 0.31700
** [JOINT LOSS] ** : 0.978735
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004677 | Grad Max: 0.108538
  -> Layer: shared_layers.0.bias | Grad Mean: 0.352795 | Grad Max: 1.586926
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.007783
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006131 | Grad Max: 0.006131
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.250992
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042974 | Grad Max: 1.422596
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.011867
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020414 | Grad Max: 0.083843
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000771
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004432 | Grad Max: 0.009694
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000349
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001316 | Grad Max: 0.003503
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001950 | Grad Max: 0.003593
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033807 | Grad Max: 0.033807
[GRADIENT NORM TOTAL] 6.8619

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.331
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5559937  0.44400632] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 460/1156 | B: 388/1468 | C: 291/1757
[LOSS Ex1] A: 0.66784 | B: 0.66788 | C: 0.66174
[LOGITS Ex2 A] Mean Abs: 1.609 | Max: 5.937
[LOSS Ex2] A: 0.20639 | B: 0.40408 | C: 0.33863
** [JOINT LOSS] ** : 0.982182
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007572 | Grad Max: 0.161772
  -> Layer: shared_layers.0.bias | Grad Mean: 0.482083 | Grad Max: 2.178672
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.008099
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000468 | Grad Max: 0.000468
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003202 | Grad Max: 0.316595
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059879 | Grad Max: 1.787220
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000656 | Grad Max: 0.017092
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029180 | Grad Max: 0.115420
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001040
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006434 | Grad Max: 0.013568
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000435
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001945 | Grad Max: 0.004509
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003093 | Grad Max: 0.005733
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052730 | Grad Max: 0.052730
[GRADIENT NORM TOTAL] 9.3192

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.465
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50599146 0.49400854] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 589/1459 | B: 404/1644 | C: 260/1788
[LOSS Ex1] A: 0.66801 | B: 0.66720 | C: 0.66437
[LOGITS Ex2 A] Mean Abs: 1.628 | Max: 6.907
[LOSS Ex2] A: 0.19965 | B: 0.42124 | C: 0.32714
** [JOINT LOSS] ** : 0.982538
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003895 | Grad Max: 0.101530
  -> Layer: shared_layers.0.bias | Grad Mean: 0.265214 | Grad Max: 1.253735
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002030 | Grad Max: 0.007743
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004519 | Grad Max: 0.004519
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001775 | Grad Max: 0.288979
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033530 | Grad Max: 1.624408
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000369 | Grad Max: 0.010775
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016402 | Grad Max: 0.071299
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000643
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003586 | Grad Max: 0.008449
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000263
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001077 | Grad Max: 0.002516
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001685 | Grad Max: 0.002886
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028843 | Grad Max: 0.028843
[GRADIENT NORM TOTAL] 5.3970

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.421
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056508  0.49434918] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 572/1476 | B: 401/1647 | C: 272/1776
[LOSS Ex1] A: 0.66675 | B: 0.66775 | C: 0.66278
[LOGITS Ex2 A] Mean Abs: 1.645 | Max: 6.275
[LOSS Ex2] A: 0.20762 | B: 0.40491 | C: 0.31260
** [JOINT LOSS] ** : 0.974140
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005371 | Grad Max: 0.159175
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289306 | Grad Max: 1.178122
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.007985
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001520 | Grad Max: 0.001520
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001851 | Grad Max: 0.219745
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034604 | Grad Max: 1.195041
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.009959
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015490 | Grad Max: 0.065507
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000546
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003420 | Grad Max: 0.007350
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000239
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001028 | Grad Max: 0.002294
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001592 | Grad Max: 0.003608
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027463 | Grad Max: 0.027463
[GRADIENT NORM TOTAL] 5.5117

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.432
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016546  0.49834538] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 565/1483 | B: 425/1623 | C: 277/1771
[LOSS Ex1] A: 0.66584 | B: 0.66476 | C: 0.66209
[LOGITS Ex2 A] Mean Abs: 1.654 | Max: 6.035
[LOSS Ex2] A: 0.22442 | B: 0.41230 | C: 0.34424
** [JOINT LOSS] ** : 0.991219
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008100 | Grad Max: 0.241395
  -> Layer: shared_layers.0.bias | Grad Mean: 0.492087 | Grad Max: 2.066132
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.008524
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003239 | Grad Max: 0.003239
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003219 | Grad Max: 0.324119
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059797 | Grad Max: 1.739436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000650 | Grad Max: 0.016825
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028740 | Grad Max: 0.110389
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001094
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006357 | Grad Max: 0.013876
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000468
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001915 | Grad Max: 0.004521
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002949 | Grad Max: 0.005129
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050887 | Grad Max: 0.050887
[GRADIENT NORM TOTAL] 9.3653

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.366
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50463057 0.49536943] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.526 | Std: 0.031
[MASKS] A(Pass/Fail): 531/1517 | B: 389/1467 | C: 191/1185
[LOSS Ex1] A: 0.67043 | B: 0.66774 | C: 0.66352
[LOGITS Ex2 A] Mean Abs: 1.628 | Max: 5.406
[LOSS Ex2] A: 0.20236 | B: 0.39211 | C: 0.28432
** [JOINT LOSS] ** : 0.960164
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005802 | Grad Max: 0.132995
  -> Layer: shared_layers.0.bias | Grad Mean: 0.288461 | Grad Max: 1.324068
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001924 | Grad Max: 0.006751
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000855 | Grad Max: 0.000855
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.227252
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035407 | Grad Max: 1.255945
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000374 | Grad Max: 0.009784
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016560 | Grad Max: 0.063309
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000626
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003681 | Grad Max: 0.007828
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000273
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001125 | Grad Max: 0.002677
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001657 | Grad Max: 0.004251
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030224 | Grad Max: 0.030224
[GRADIENT NORM TOTAL] 5.6014

[EPOCH SUMMARY] Train Loss: 0.9775

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9523 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9541 -> New: 0.9523)

############################## EPOCH 63/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.261
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5238829 0.4761171] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.526 | Std: 0.031
[MASKS] A(Pass/Fail): 544/1504 | B: 404/1644 | C: 252/1796
[LOSS Ex1] A: 0.66998 | B: 0.66707 | C: 0.66640
[LOGITS Ex2 A] Mean Abs: 1.547 | Max: 6.731
[LOSS Ex2] A: 0.20547 | B: 0.41405 | C: 0.35027
** [JOINT LOSS] ** : 0.991082
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003418 | Grad Max: 0.072407
  -> Layer: shared_layers.0.bias | Grad Mean: 0.180759 | Grad Max: 0.832627
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.007030
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001065 | Grad Max: 0.001065
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001298 | Grad Max: 0.154591
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024352 | Grad Max: 0.878496
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000254 | Grad Max: 0.008596
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011230 | Grad Max: 0.053872
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000466
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002463 | Grad Max: 0.005837
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000750 | Grad Max: 0.001943
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001259 | Grad Max: 0.002442
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020930 | Grad Max: 0.020930
[GRADIENT NORM TOTAL] 3.7487

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.418
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6038275  0.39617252] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.032
[MASKS] A(Pass/Fail): 602/1446 | B: 405/1643 | C: 289/1759
[LOSS Ex1] A: 0.66711 | B: 0.66762 | C: 0.66182
[LOGITS Ex2 A] Mean Abs: 1.586 | Max: 6.026
[LOSS Ex2] A: 0.19346 | B: 0.41610 | C: 0.31342
** [JOINT LOSS] ** : 0.973178
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004490 | Grad Max: 0.099481
  -> Layer: shared_layers.0.bias | Grad Mean: 0.266862 | Grad Max: 1.162144
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.008394
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009012 | Grad Max: 0.009012
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001773 | Grad Max: 0.170255
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033556 | Grad Max: 0.969582
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.009948
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015854 | Grad Max: 0.064321
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000676
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003469 | Grad Max: 0.008538
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001033 | Grad Max: 0.002625
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001573 | Grad Max: 0.003452
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026652 | Grad Max: 0.026652
[GRADIENT NORM TOTAL] 5.2216

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.467
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50024956 0.49975044] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.032
[MASKS] A(Pass/Fail): 570/1478 | B: 428/1620 | C: 269/1779
[LOSS Ex1] A: 0.67075 | B: 0.66462 | C: 0.66335
[LOGITS Ex2 A] Mean Abs: 1.613 | Max: 5.565
[LOSS Ex2] A: 0.19760 | B: 0.38829 | C: 0.30537
** [JOINT LOSS] ** : 0.963332
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.060657
  -> Layer: shared_layers.0.bias | Grad Mean: 0.040540 | Grad Max: 0.254459
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.007306
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004231 | Grad Max: 0.004231
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000385 | Grad Max: 0.125370
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006357 | Grad Max: 0.713282
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.003062
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001904 | Grad Max: 0.017588
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000173
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000429 | Grad Max: 0.001890
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000073
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000130 | Grad Max: 0.000568
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000545 | Grad Max: 0.001415
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003088 | Grad Max: 0.003088
[GRADIENT NORM TOTAL] 1.1932

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.255
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5907908 0.4092092] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.032
[MASKS] A(Pass/Fail): 559/1489 | B: 389/1467 | C: 268/1780
[LOSS Ex1] A: 0.66869 | B: 0.66760 | C: 0.66292
[LOGITS Ex2 A] Mean Abs: 1.640 | Max: 5.753
[LOSS Ex2] A: 0.21989 | B: 0.38019 | C: 0.33020
** [JOINT LOSS] ** : 0.976496
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006986 | Grad Max: 0.146043
  -> Layer: shared_layers.0.bias | Grad Mean: 0.370186 | Grad Max: 1.603212
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.007815
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005536 | Grad Max: 0.005536
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002543 | Grad Max: 0.227916
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047356 | Grad Max: 1.268313
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000496 | Grad Max: 0.013909
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021976 | Grad Max: 0.095281
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000803
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004853 | Grad Max: 0.010362
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000351
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001473 | Grad Max: 0.003389
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002325 | Grad Max: 0.004721
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040546 | Grad Max: 0.040546
[GRADIENT NORM TOTAL] 7.2348

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.335
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5564338  0.44356617] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 461/1155 | B: 405/1643 | C: 282/1766
[LOSS Ex1] A: 0.66760 | B: 0.66692 | C: 0.66322
[LOGITS Ex2 A] Mean Abs: 1.683 | Max: 5.993
[LOSS Ex2] A: 0.19697 | B: 0.42657 | C: 0.32974
** [JOINT LOSS] ** : 0.983673
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005347 | Grad Max: 0.112464
  -> Layer: shared_layers.0.bias | Grad Mean: 0.304243 | Grad Max: 1.296349
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.007082
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006336 | Grad Max: 0.006336
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.173265
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038065 | Grad Max: 0.968148
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.011808
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017743 | Grad Max: 0.073402
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000743
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003908 | Grad Max: 0.008754
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000293
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001170 | Grad Max: 0.002855
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001632 | Grad Max: 0.003127
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029754 | Grad Max: 0.029754
[GRADIENT NORM TOTAL] 5.7971

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.468
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060019 0.4939981] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 594/1454 | B: 409/1639 | C: 270/1778
[LOSS Ex1] A: 0.66776 | B: 0.66747 | C: 0.66308
[LOGITS Ex2 A] Mean Abs: 1.624 | Max: 6.099
[LOSS Ex2] A: 0.20346 | B: 0.40577 | C: 0.33288
** [JOINT LOSS] ** : 0.980143
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001465 | Grad Max: 0.036026
  -> Layer: shared_layers.0.bias | Grad Mean: 0.029912 | Grad Max: 0.163630
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.007789
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003813 | Grad Max: 0.003813
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000353 | Grad Max: 0.041874
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005754 | Grad Max: 0.235753
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002266
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001511 | Grad Max: 0.012922
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000176
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000288 | Grad Max: 0.001979
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000067
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000086 | Grad Max: 0.000607
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000314 | Grad Max: 0.001020
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002116 | Grad Max: 0.002116
[GRADIENT NORM TOTAL] 0.8548

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.421
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056335  0.49436656] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 572/1476 | B: 431/1617 | C: 285/1763
[LOSS Ex1] A: 0.66649 | B: 0.66444 | C: 0.66175
[LOGITS Ex2 A] Mean Abs: 1.615 | Max: 5.848
[LOSS Ex2] A: 0.19520 | B: 0.39315 | C: 0.30739
** [JOINT LOSS] ** : 0.962806
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.043943
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143031 | Grad Max: 0.593507
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.008656
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011535 | Grad Max: 0.011535
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000922 | Grad Max: 0.097916
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017139 | Grad Max: 0.555872
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006666
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008063 | Grad Max: 0.036802
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000370
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001742 | Grad Max: 0.004366
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000157
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000511 | Grad Max: 0.001622
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000729 | Grad Max: 0.002212
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012516 | Grad Max: 0.012516
[GRADIENT NORM TOTAL] 2.7759

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.433
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016708  0.49832922] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 572/1476 | B: 391/1465 | C: 265/1783
[LOSS Ex1] A: 0.66555 | B: 0.66742 | C: 0.66284
[LOGITS Ex2 A] Mean Abs: 1.626 | Max: 6.168
[LOSS Ex2] A: 0.21691 | B: 0.38754 | C: 0.31643
** [JOINT LOSS] ** : 0.972233
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.044037
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116510 | Grad Max: 0.670056
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.008573
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003966 | Grad Max: 0.003966
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000763 | Grad Max: 0.108260
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013418 | Grad Max: 0.600465
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004768
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005464 | Grad Max: 0.031309
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000254
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001113 | Grad Max: 0.003256
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000111
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000308 | Grad Max: 0.000999
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001431
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006280 | Grad Max: 0.006280
[GRADIENT NORM TOTAL] 2.3209

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.367
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50462526 0.49537468] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.032
[MASKS] A(Pass/Fail): 536/1512 | B: 406/1642 | C: 283/1765
[LOSS Ex1] A: 0.67016 | B: 0.66673 | C: 0.66241
[LOGITS Ex2 A] Mean Abs: 1.631 | Max: 5.696
[LOSS Ex2] A: 0.20290 | B: 0.42404 | C: 0.30277
** [JOINT LOSS] ** : 0.976337
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003973 | Grad Max: 0.111358
  -> Layer: shared_layers.0.bias | Grad Mean: 0.327291 | Grad Max: 1.433501
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001960 | Grad Max: 0.006913
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000399 | Grad Max: 0.000399
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.185921
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038589 | Grad Max: 1.019331
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000425 | Grad Max: 0.012825
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019069 | Grad Max: 0.083233
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000694
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004114 | Grad Max: 0.009306
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000305
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001228 | Grad Max: 0.002853
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001749 | Grad Max: 0.003746
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032274 | Grad Max: 0.032274
[GRADIENT NORM TOTAL] 6.3311

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.266
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5240272 0.4759728] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.031
[MASKS] A(Pass/Fail): 548/1500 | B: 410/1638 | C: 276/1772
[LOSS Ex1] A: 0.66971 | B: 0.66727 | C: 0.66204
[LOGITS Ex2 A] Mean Abs: 1.609 | Max: 5.898
[LOSS Ex2] A: 0.21544 | B: 0.40967 | C: 0.32744
** [JOINT LOSS] ** : 0.983860
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005371 | Grad Max: 0.133389
  -> Layer: shared_layers.0.bias | Grad Mean: 0.372008 | Grad Max: 1.627683
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.007013
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001831 | Grad Max: 0.001831
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.234807
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044740 | Grad Max: 1.245124
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.015950
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022040 | Grad Max: 0.102502
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000836
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004799 | Grad Max: 0.010336
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000346
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001430 | Grad Max: 0.003310
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002110 | Grad Max: 0.003891
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037262 | Grad Max: 0.037262
[GRADIENT NORM TOTAL] 7.1978

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.421
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6060464 0.3939536] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.033
[MASKS] A(Pass/Fail): 606/1442 | B: 436/1612 | C: 263/1785
[LOSS Ex1] A: 0.66679 | B: 0.66424 | C: 0.66338
[LOGITS Ex2 A] Mean Abs: 1.633 | Max: 6.175
[LOSS Ex2] A: 0.19230 | B: 0.39345 | C: 0.31856
** [JOINT LOSS] ** : 0.966239
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.050930
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108426 | Grad Max: 0.397108
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.008506
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010588 | Grad Max: 0.010588
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000780 | Grad Max: 0.118868
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013883 | Grad Max: 0.670173
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.005834
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005525 | Grad Max: 0.029783
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000272
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001168 | Grad Max: 0.003049
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000119
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000359 | Grad Max: 0.001154
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000571 | Grad Max: 0.001944
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010397 | Grad Max: 0.010397
[GRADIENT NORM TOTAL] 2.3128

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.471
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002565  0.49974355] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 573/1475 | B: 395/1461 | C: 254/1794
[LOSS Ex1] A: 0.67044 | B: 0.66723 | C: 0.66510
[LOGITS Ex2 A] Mean Abs: 1.598 | Max: 5.641
[LOSS Ex2] A: 0.20093 | B: 0.40340 | C: 0.32103
** [JOINT LOSS] ** : 0.976048
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008144 | Grad Max: 0.244861
  -> Layer: shared_layers.0.bias | Grad Mean: 0.421342 | Grad Max: 1.709963
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001961 | Grad Max: 0.007373
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008018 | Grad Max: 0.008018
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002821 | Grad Max: 0.301225
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052535 | Grad Max: 1.604407
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000541 | Grad Max: 0.014866
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023948 | Grad Max: 0.097209
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000841
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005338 | Grad Max: 0.011208
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000387
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001598 | Grad Max: 0.003825
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002404 | Grad Max: 0.004416
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041473 | Grad Max: 0.041473
[GRADIENT NORM TOTAL] 8.0861

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.260
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5917732  0.40822676] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.032
[MASKS] A(Pass/Fail): 561/1487 | B: 407/1641 | C: 253/1795
[LOSS Ex1] A: 0.66835 | B: 0.66655 | C: 0.66556
[LOGITS Ex2 A] Mean Abs: 1.581 | Max: 5.668
[LOSS Ex2] A: 0.23011 | B: 0.44357 | C: 0.34152
** [JOINT LOSS] ** : 1.005217
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010322 | Grad Max: 0.243568
  -> Layer: shared_layers.0.bias | Grad Mean: 0.638289 | Grad Max: 2.608570
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.007597
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006058 | Grad Max: 0.006058
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004217 | Grad Max: 0.408090
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.079028 | Grad Max: 2.224020
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000830 | Grad Max: 0.021656
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036930 | Grad Max: 0.144461
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001246
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008160 | Grad Max: 0.017399
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000589
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002443 | Grad Max: 0.005705
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003565 | Grad Max: 0.007066
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063075 | Grad Max: 0.063075
[GRADIENT NORM TOTAL] 12.2394

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.340
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55704296 0.44295704] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.033
[MASKS] A(Pass/Fail): 464/1152 | B: 415/1633 | C: 198/1178
[LOSS Ex1] A: 0.66726 | B: 0.66709 | C: 0.66092
[LOGITS Ex2 A] Mean Abs: 1.627 | Max: 5.741
[LOSS Ex2] A: 0.20068 | B: 0.42338 | C: 0.32155
** [JOINT LOSS] ** : 0.980295
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006658 | Grad Max: 0.186397
  -> Layer: shared_layers.0.bias | Grad Mean: 0.432832 | Grad Max: 1.780846
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.007517
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000818 | Grad Max: 0.000818
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002885 | Grad Max: 0.249868
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053714 | Grad Max: 1.398377
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000571 | Grad Max: 0.014100
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025476 | Grad Max: 0.099661
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000927
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005619 | Grad Max: 0.012027
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000420
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001689 | Grad Max: 0.004241
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002579 | Grad Max: 0.005315
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044811 | Grad Max: 0.044811
[GRADIENT NORM TOTAL] 8.2996

[EPOCH SUMMARY] Train Loss: 0.9779

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9504 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9523 -> New: 0.9504)

############################## EPOCH 64/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.472
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059829 0.4940171] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 594/1454 | B: 439/1609 | C: 279/1769
[LOSS Ex1] A: 0.66741 | B: 0.66405 | C: 0.66158
[LOGITS Ex2 A] Mean Abs: 1.653 | Max: 6.360
[LOSS Ex2] A: 0.19409 | B: 0.38771 | C: 0.31639
** [JOINT LOSS] ** : 0.963742
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001373 | Grad Max: 0.052159
  -> Layer: shared_layers.0.bias | Grad Mean: 0.090091 | Grad Max: 0.464666
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.007852
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002227 | Grad Max: 0.002227
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000611 | Grad Max: 0.071857
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010925 | Grad Max: 0.401318
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.004764
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003983 | Grad Max: 0.024810
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000235
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000844 | Grad Max: 0.002688
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000256 | Grad Max: 0.001072
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000463 | Grad Max: 0.001707
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007109 | Grad Max: 0.007109
[GRADIENT NORM TOTAL] 1.8470

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.423
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056633  0.49433675] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 575/1473 | B: 397/1459 | C: 265/1783
[LOSS Ex1] A: 0.66613 | B: 0.66704 | C: 0.66383
[LOGITS Ex2 A] Mean Abs: 1.665 | Max: 5.686
[LOSS Ex2] A: 0.21539 | B: 0.38042 | C: 0.32377
** [JOINT LOSS] ** : 0.972196
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006821 | Grad Max: 0.173036
  -> Layer: shared_layers.0.bias | Grad Mean: 0.294651 | Grad Max: 1.238211
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.008104
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007306 | Grad Max: 0.007306
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001975 | Grad Max: 0.187176
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037049 | Grad Max: 0.943174
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000393 | Grad Max: 0.011077
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017315 | Grad Max: 0.067822
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000712
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003821 | Grad Max: 0.008771
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000271
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001146 | Grad Max: 0.002693
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001753 | Grad Max: 0.003767
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030163 | Grad Max: 0.030163
[GRADIENT NORM TOTAL] 5.6036

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.435
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50161874 0.49838126] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 576/1472 | B: 410/1638 | C: 270/1778
[LOSS Ex1] A: 0.66518 | B: 0.66636 | C: 0.66345
[LOGITS Ex2 A] Mean Abs: 1.650 | Max: 6.144
[LOSS Ex2] A: 0.22020 | B: 0.40915 | C: 0.32496
** [JOINT LOSS] ** : 0.983104
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005247 | Grad Max: 0.205912
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111106 | Grad Max: 0.477594
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.008164
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002884 | Grad Max: 0.002884
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000993 | Grad Max: 0.090218
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016902 | Grad Max: 0.490920
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000171 | Grad Max: 0.004255
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007165 | Grad Max: 0.027924
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000387
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001690 | Grad Max: 0.004398
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000517 | Grad Max: 0.001410
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000767 | Grad Max: 0.001961
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013536 | Grad Max: 0.013536
[GRADIENT NORM TOTAL] 2.4204

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.368
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045759  0.49542406] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.032
[MASKS] A(Pass/Fail): 537/1511 | B: 419/1629 | C: 289/1759
[LOSS Ex1] A: 0.66983 | B: 0.66689 | C: 0.66032
[LOGITS Ex2 A] Mean Abs: 1.586 | Max: 5.867
[LOSS Ex2] A: 0.19636 | B: 0.41305 | C: 0.31849
** [JOINT LOSS] ** : 0.974979
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004315 | Grad Max: 0.126521
  -> Layer: shared_layers.0.bias | Grad Mean: 0.360309 | Grad Max: 1.601705
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.006573
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003510 | Grad Max: 0.003510
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002362 | Grad Max: 0.233227
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044870 | Grad Max: 1.325794
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000478 | Grad Max: 0.013584
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021560 | Grad Max: 0.090611
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000743
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004668 | Grad Max: 0.009927
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000326
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001388 | Grad Max: 0.003341
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002155 | Grad Max: 0.004013
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036989 | Grad Max: 0.036989
[GRADIENT NORM TOTAL] 7.0441

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.272
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5242624  0.47573757] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.032
[MASKS] A(Pass/Fail): 551/1497 | B: 439/1609 | C: 276/1772
[LOSS Ex1] A: 0.66939 | B: 0.66384 | C: 0.66235
[LOGITS Ex2 A] Mean Abs: 1.560 | Max: 6.873
[LOSS Ex2] A: 0.20839 | B: 0.40562 | C: 0.32955
** [JOINT LOSS] ** : 0.979714
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006563 | Grad Max: 0.153790
  -> Layer: shared_layers.0.bias | Grad Mean: 0.404300 | Grad Max: 1.715205
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.006794
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003836 | Grad Max: 0.003836
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002674 | Grad Max: 0.264209
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050376 | Grad Max: 1.489743
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000538 | Grad Max: 0.014805
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024021 | Grad Max: 0.097063
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000804
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005275 | Grad Max: 0.010630
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000364
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001575 | Grad Max: 0.003699
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002457 | Grad Max: 0.004625
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041224 | Grad Max: 0.041224
[GRADIENT NORM TOTAL] 7.7638

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.424
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6085036 0.3914964] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.033
[MASKS] A(Pass/Fail): 611/1437 | B: 400/1456 | C: 261/1787
[LOSS Ex1] A: 0.66642 | B: 0.66684 | C: 0.66343
[LOGITS Ex2 A] Mean Abs: 1.634 | Max: 5.562
[LOSS Ex2] A: 0.18856 | B: 0.38207 | C: 0.33015
** [JOINT LOSS] ** : 0.965825
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002436 | Grad Max: 0.061099
  -> Layer: shared_layers.0.bias | Grad Mean: 0.114295 | Grad Max: 0.502770
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.007975
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007418 | Grad Max: 0.007418
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000796 | Grad Max: 0.102448
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014551 | Grad Max: 0.560836
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000151 | Grad Max: 0.005070
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006639 | Grad Max: 0.033065
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000343
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001476 | Grad Max: 0.003924
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000142
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000426 | Grad Max: 0.001355
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000499 | Grad Max: 0.001640
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009426 | Grad Max: 0.009426
[GRADIENT NORM TOTAL] 2.2332

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.473
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5001995 0.4998005] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 578/1470 | B: 413/1635 | C: 270/1778
[LOSS Ex1] A: 0.67011 | B: 0.66616 | C: 0.66310
[LOGITS Ex2 A] Mean Abs: 1.692 | Max: 5.499
[LOSS Ex2] A: 0.19150 | B: 0.42288 | C: 0.33940
** [JOINT LOSS] ** : 0.984386
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004854 | Grad Max: 0.147905
  -> Layer: shared_layers.0.bias | Grad Mean: 0.449796 | Grad Max: 1.938153
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001997 | Grad Max: 0.007489
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005675 | Grad Max: 0.005675
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002872 | Grad Max: 0.262422
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054171 | Grad Max: 1.455778
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000582 | Grad Max: 0.017003
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026302 | Grad Max: 0.117547
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000976
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005639 | Grad Max: 0.012200
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000398
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001657 | Grad Max: 0.004016
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002329 | Grad Max: 0.004585
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041558 | Grad Max: 0.041558
[GRADIENT NORM TOTAL] 8.8447

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.266
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5928031  0.40719688] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 566/1482 | B: 423/1625 | C: 280/1768
[LOSS Ex1] A: 0.66798 | B: 0.66670 | C: 0.66142
[LOGITS Ex2 A] Mean Abs: 1.694 | Max: 6.106
[LOSS Ex2] A: 0.22705 | B: 0.43547 | C: 0.33308
** [JOINT LOSS] ** : 0.997232
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007997 | Grad Max: 0.227594
  -> Layer: shared_layers.0.bias | Grad Mean: 0.657710 | Grad Max: 2.895786
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002067 | Grad Max: 0.007498
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002468 | Grad Max: 0.002468
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004181 | Grad Max: 0.387183
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.079253 | Grad Max: 2.193588
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000834 | Grad Max: 0.022446
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037823 | Grad Max: 0.159007
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001289
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008247 | Grad Max: 0.017648
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000597
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002456 | Grad Max: 0.006047
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003776 | Grad Max: 0.006590
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064543 | Grad Max: 0.064543
[GRADIENT NORM TOTAL] 12.8623

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.55767846 0.4423216 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.033
[MASKS] A(Pass/Fail): 468/1148 | B: 441/1607 | C: 293/1755
[LOSS Ex1] A: 0.66690 | B: 0.66365 | C: 0.66150
[LOGITS Ex2 A] Mean Abs: 1.726 | Max: 6.572
[LOSS Ex2] A: 0.18932 | B: 0.40159 | C: 0.30712
** [JOINT LOSS] ** : 0.963360
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005197 | Grad Max: 0.168997
  -> Layer: shared_layers.0.bias | Grad Mean: 0.479700 | Grad Max: 2.174543
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.008106
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002035 | Grad Max: 0.002035
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002985 | Grad Max: 0.295514
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056476 | Grad Max: 1.629306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000592 | Grad Max: 0.017070
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026913 | Grad Max: 0.113600
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.001024
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005855 | Grad Max: 0.012368
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000402
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001733 | Grad Max: 0.004034
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002500 | Grad Max: 0.004927
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044493 | Grad Max: 0.044493
[GRADIENT NORM TOTAL] 9.2853

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.474
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059948 0.4940052] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 594/1454 | B: 401/1455 | C: 267/1781
[LOSS Ex1] A: 0.66703 | B: 0.66666 | C: 0.66107
[LOGITS Ex2 A] Mean Abs: 1.665 | Max: 6.434
[LOSS Ex2] A: 0.19872 | B: 0.38393 | C: 0.31489
** [JOINT LOSS] ** : 0.964096
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002390 | Grad Max: 0.079296
  -> Layer: shared_layers.0.bias | Grad Mean: 0.044634 | Grad Max: 0.204423
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.007577
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006364 | Grad Max: 0.006364
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000468 | Grad Max: 0.093513
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007519 | Grad Max: 0.530388
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.003323
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001365 | Grad Max: 0.013921
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000153
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000212 | Grad Max: 0.001453
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000071 | Grad Max: 0.000458
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000436 | Grad Max: 0.001143
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001449 | Grad Max: 0.001449
[GRADIENT NORM TOTAL] 1.3178

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.424
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5057479  0.49425206] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.033
[MASKS] A(Pass/Fail): 578/1470 | B: 414/1634 | C: 267/1781
[LOSS Ex1] A: 0.66574 | B: 0.66598 | C: 0.66196
[LOGITS Ex2 A] Mean Abs: 1.628 | Max: 5.871
[LOSS Ex2] A: 0.20410 | B: 0.43569 | C: 0.32650
** [JOINT LOSS] ** : 0.986655
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006143 | Grad Max: 0.157549
  -> Layer: shared_layers.0.bias | Grad Mean: 0.457860 | Grad Max: 2.027681
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.008370
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008642 | Grad Max: 0.008642
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002961 | Grad Max: 0.250930
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055775 | Grad Max: 1.369955
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.015128
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027480 | Grad Max: 0.107131
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.000960
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006020 | Grad Max: 0.013580
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000413
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001809 | Grad Max: 0.004199
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002772 | Grad Max: 0.005129
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047681 | Grad Max: 0.047681
[GRADIENT NORM TOTAL] 8.6972

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.436
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50161266 0.49838737] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.033
[MASKS] A(Pass/Fail): 579/1469 | B: 426/1622 | C: 261/1787
[LOSS Ex1] A: 0.66478 | B: 0.66651 | C: 0.66320
[LOGITS Ex2 A] Mean Abs: 1.613 | Max: 6.102
[LOSS Ex2] A: 0.22154 | B: 0.42679 | C: 0.32075
** [JOINT LOSS] ** : 0.987856
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005046 | Grad Max: 0.157832
  -> Layer: shared_layers.0.bias | Grad Mean: 0.467063 | Grad Max: 2.086988
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.008558
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009818 | Grad Max: 0.009818
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002913 | Grad Max: 0.316020
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054865 | Grad Max: 1.773807
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000586 | Grad Max: 0.019645
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026592 | Grad Max: 0.124463
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001012
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005758 | Grad Max: 0.012334
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000405
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001718 | Grad Max: 0.003968
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002574 | Grad Max: 0.004830
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044876 | Grad Max: 0.044876
[GRADIENT NORM TOTAL] 9.1866

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.369
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50455976 0.49544024] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.527 | Std: 0.033
[MASKS] A(Pass/Fail): 540/1508 | B: 446/1602 | C: 249/1799
[LOSS Ex1] A: 0.66949 | B: 0.66346 | C: 0.66296
[LOGITS Ex2 A] Mean Abs: 1.613 | Max: 5.900
[LOSS Ex2] A: 0.19341 | B: 0.39123 | C: 0.30670
** [JOINT LOSS] ** : 0.962416
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002492 | Grad Max: 0.064327
  -> Layer: shared_layers.0.bias | Grad Mean: 0.218741 | Grad Max: 0.913266
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.007162
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004665 | Grad Max: 0.004665
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001455 | Grad Max: 0.248636
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026793 | Grad Max: 1.404232
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.009028
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013029 | Grad Max: 0.056024
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000473
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002767 | Grad Max: 0.006101
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000200
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000812 | Grad Max: 0.001945
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001200 | Grad Max: 0.002744
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020505 | Grad Max: 0.020505
[GRADIENT NORM TOTAL] 4.6488

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.277
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5245012  0.47549883] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.527 | Std: 0.032
[MASKS] A(Pass/Fail): 553/1495 | B: 403/1453 | C: 155/1221
[LOSS Ex1] A: 0.66908 | B: 0.66647 | C: 0.66592
[LOGITS Ex2 A] Mean Abs: 1.635 | Max: 5.970
[LOSS Ex2] A: 0.21214 | B: 0.37845 | C: 0.33583
** [JOINT LOSS] ** : 0.975962
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008258 | Grad Max: 0.252415
  -> Layer: shared_layers.0.bias | Grad Mean: 0.337588 | Grad Max: 1.448382
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001961 | Grad Max: 0.007340
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005246 | Grad Max: 0.005246
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002406 | Grad Max: 0.219218
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044395 | Grad Max: 1.112526
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000476 | Grad Max: 0.012732
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021203 | Grad Max: 0.085858
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000829
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004751 | Grad Max: 0.011075
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000343
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001423 | Grad Max: 0.003432
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002131 | Grad Max: 0.004316
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037407 | Grad Max: 0.037407
[GRADIENT NORM TOTAL] 6.4093

[EPOCH SUMMARY] Train Loss: 0.9758

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9619 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 65/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.425
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6107488  0.38925114] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.034
[MASKS] A(Pass/Fail): 614/1434 | B: 414/1634 | C: 290/1758
[LOSS Ex1] A: 0.66605 | B: 0.66580 | C: 0.66079
[LOGITS Ex2 A] Mean Abs: 1.683 | Max: 5.455
[LOSS Ex2] A: 0.21450 | B: 0.42130 | C: 0.33217
** [JOINT LOSS] ** : 0.986867
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011326 | Grad Max: 0.354869
  -> Layer: shared_layers.0.bias | Grad Mean: 0.485410 | Grad Max: 2.038709
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.007870
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003204 | Grad Max: 0.003204
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003363 | Grad Max: 0.297169
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062203 | Grad Max: 1.497937
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000646 | Grad Max: 0.014661
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028684 | Grad Max: 0.105598
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001044
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006412 | Grad Max: 0.014522
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000445
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001914 | Grad Max: 0.004436
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002904 | Grad Max: 0.005034
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049175 | Grad Max: 0.049175
[GRADIENT NORM TOTAL] 9.1639

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.475
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5001479  0.49985212] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 581/1467 | B: 427/1621 | C: 251/1797
[LOSS Ex1] A: 0.66979 | B: 0.66633 | C: 0.66304
[LOGITS Ex2 A] Mean Abs: 1.651 | Max: 5.496
[LOSS Ex2] A: 0.19814 | B: 0.40233 | C: 0.31575
** [JOINT LOSS] ** : 0.971795
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005174 | Grad Max: 0.137910
  -> Layer: shared_layers.0.bias | Grad Mean: 0.283785 | Grad Max: 1.188052
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002035 | Grad Max: 0.007406
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009257 | Grad Max: 0.009257
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001922 | Grad Max: 0.178336
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036104 | Grad Max: 1.004098
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.011249
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017274 | Grad Max: 0.072557
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000598
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003821 | Grad Max: 0.008107
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000268
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001147 | Grad Max: 0.002731
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001622 | Grad Max: 0.003629
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029149 | Grad Max: 0.029149
[GRADIENT NORM TOTAL] 5.4516

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.270
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5936879  0.40631208] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 569/1479 | B: 447/1601 | C: 286/1762
[LOSS Ex1] A: 0.66763 | B: 0.66328 | C: 0.66122
[LOGITS Ex2 A] Mean Abs: 1.629 | Max: 5.956
[LOSS Ex2] A: 0.20864 | B: 0.39478 | C: 0.32237
** [JOINT LOSS] ** : 0.972640
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003762 | Grad Max: 0.098554
  -> Layer: shared_layers.0.bias | Grad Mean: 0.307364 | Grad Max: 1.332000
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.007590
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001547 | Grad Max: 0.001547
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.248394
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036572 | Grad Max: 1.402051
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000383 | Grad Max: 0.010631
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017337 | Grad Max: 0.069880
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000611
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003779 | Grad Max: 0.008066
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000282
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001122 | Grad Max: 0.002862
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001690 | Grad Max: 0.003129
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028919 | Grad Max: 0.028919
[GRADIENT NORM TOTAL] 6.0970

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.349
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5582951  0.44170496] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 471/1145 | B: 404/1452 | C: 246/1802
[LOSS Ex1] A: 0.66656 | B: 0.66630 | C: 0.66306
[LOGITS Ex2 A] Mean Abs: 1.634 | Max: 5.561
[LOSS Ex2] A: 0.20386 | B: 0.40698 | C: 0.30133
** [JOINT LOSS] ** : 0.969362
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008076 | Grad Max: 0.185066
  -> Layer: shared_layers.0.bias | Grad Mean: 0.490722 | Grad Max: 2.099156
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.007842
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001632 | Grad Max: 0.001632
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003198 | Grad Max: 0.297105
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059774 | Grad Max: 1.701940
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000634 | Grad Max: 0.017274
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028454 | Grad Max: 0.115080
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000989
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006288 | Grad Max: 0.012964
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000445
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001885 | Grad Max: 0.004626
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002854 | Grad Max: 0.005078
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048802 | Grad Max: 0.048802
[GRADIENT NORM TOTAL] 9.3515

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.476
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50595844 0.49404156] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 596/1452 | B: 416/1632 | C: 265/1783
[LOSS Ex1] A: 0.66669 | B: 0.66564 | C: 0.66135
[LOGITS Ex2 A] Mean Abs: 1.645 | Max: 6.679
[LOSS Ex2] A: 0.19958 | B: 0.42054 | C: 0.33430
** [JOINT LOSS] ** : 0.982700
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006356 | Grad Max: 0.164542
  -> Layer: shared_layers.0.bias | Grad Mean: 0.343099 | Grad Max: 1.419997
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.007553
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002871 | Grad Max: 0.002871
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.229120
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042146 | Grad Max: 1.258854
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000444 | Grad Max: 0.014240
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019804 | Grad Max: 0.083138
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000738
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004398 | Grad Max: 0.009332
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000324
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001320 | Grad Max: 0.003062
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002028 | Grad Max: 0.003903
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034313 | Grad Max: 0.034313
[GRADIENT NORM TOTAL] 6.5818

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.425
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50586534 0.49413463] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 581/1467 | B: 428/1620 | C: 278/1770
[LOSS Ex1] A: 0.66540 | B: 0.66618 | C: 0.66189
[LOGITS Ex2 A] Mean Abs: 1.673 | Max: 5.582
[LOSS Ex2] A: 0.19631 | B: 0.40249 | C: 0.31929
** [JOINT LOSS] ** : 0.970520
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002793 | Grad Max: 0.080660
  -> Layer: shared_layers.0.bias | Grad Mean: 0.230138 | Grad Max: 1.095554
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.008624
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013616 | Grad Max: 0.013616
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001424 | Grad Max: 0.161714
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026212 | Grad Max: 0.904797
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000253 | Grad Max: 0.006673
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011512 | Grad Max: 0.048668
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000423
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002458 | Grad Max: 0.005846
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000193
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000720 | Grad Max: 0.001927
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000979 | Grad Max: 0.002803
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018126 | Grad Max: 0.018126
[GRADIENT NORM TOTAL] 4.5764

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.437
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015286 0.4984714] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 579/1469 | B: 447/1601 | C: 257/1791
[LOSS Ex1] A: 0.66443 | B: 0.66313 | C: 0.66173
[LOGITS Ex2 A] Mean Abs: 1.676 | Max: 7.520
[LOSS Ex2] A: 0.22187 | B: 0.39708 | C: 0.31706
** [JOINT LOSS] ** : 0.975102
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004174 | Grad Max: 0.123480
  -> Layer: shared_layers.0.bias | Grad Mean: 0.380196 | Grad Max: 1.706880
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002264 | Grad Max: 0.008275
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005990 | Grad Max: 0.005990
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002331 | Grad Max: 0.227073
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043873 | Grad Max: 1.299753
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.013448
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021236 | Grad Max: 0.091242
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000784
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004647 | Grad Max: 0.010237
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000332
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001394 | Grad Max: 0.003108
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002057 | Grad Max: 0.003928
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036292 | Grad Max: 0.036292
[GRADIENT NORM TOTAL] 7.3582

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.370
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044672  0.49553284] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 543/1505 | B: 405/1451 | C: 260/1788
[LOSS Ex1] A: 0.66921 | B: 0.66615 | C: 0.66273
[LOGITS Ex2 A] Mean Abs: 1.626 | Max: 5.885
[LOSS Ex2] A: 0.19855 | B: 0.38711 | C: 0.31618
** [JOINT LOSS] ** : 0.966642
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002666 | Grad Max: 0.075299
  -> Layer: shared_layers.0.bias | Grad Mean: 0.159040 | Grad Max: 0.727755
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001919 | Grad Max: 0.006430
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004504 | Grad Max: 0.004504
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000973 | Grad Max: 0.167243
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018175 | Grad Max: 0.911852
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000179 | Grad Max: 0.006565
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007964 | Grad Max: 0.038868
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000338
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001701 | Grad Max: 0.004216
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000507 | Grad Max: 0.001385
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000667 | Grad Max: 0.002256
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012619 | Grad Max: 0.012619
[GRADIENT NORM TOTAL] 3.2006

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.282
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52478623 0.47521383] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 555/1493 | B: 416/1632 | C: 262/1786
[LOSS Ex1] A: 0.66881 | B: 0.66550 | C: 0.66215
[LOGITS Ex2 A] Mean Abs: 1.564 | Max: 6.027
[LOSS Ex2] A: 0.20577 | B: 0.41533 | C: 0.33257
** [JOINT LOSS] ** : 0.983374
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005250 | Grad Max: 0.127103
  -> Layer: shared_layers.0.bias | Grad Mean: 0.292457 | Grad Max: 1.323260
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001950 | Grad Max: 0.006760
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004965 | Grad Max: 0.004965
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001931 | Grad Max: 0.204102
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035989 | Grad Max: 1.087935
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.010998
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017615 | Grad Max: 0.077400
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000616
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003906 | Grad Max: 0.008304
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000295
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001193 | Grad Max: 0.002896
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001968 | Grad Max: 0.003897
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032247 | Grad Max: 0.032247
[GRADIENT NORM TOTAL] 5.5737

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.427
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6127213  0.38727874] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.034
[MASKS] A(Pass/Fail): 616/1432 | B: 432/1616 | C: 255/1793
[LOSS Ex1] A: 0.66572 | B: 0.66604 | C: 0.66272
[LOGITS Ex2 A] Mean Abs: 1.615 | Max: 6.182
[LOSS Ex2] A: 0.19649 | B: 0.40867 | C: 0.31856
** [JOINT LOSS] ** : 0.972732
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004652 | Grad Max: 0.109814
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282283 | Grad Max: 1.243654
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.008141
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012052 | Grad Max: 0.012052
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.187894
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033467 | Grad Max: 1.049320
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000367 | Grad Max: 0.010031
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016498 | Grad Max: 0.067374
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000558
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003609 | Grad Max: 0.008305
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000285
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001069 | Grad Max: 0.002744
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001575 | Grad Max: 0.003071
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026654 | Grad Max: 0.026654
[GRADIENT NORM TOTAL] 5.2534

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.477
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000769  0.49992314] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 584/1464 | B: 448/1600 | C: 261/1787
[LOSS Ex1] A: 0.66950 | B: 0.66299 | C: 0.66066
[LOGITS Ex2 A] Mean Abs: 1.647 | Max: 5.302
[LOSS Ex2] A: 0.19025 | B: 0.38215 | C: 0.32054
** [JOINT LOSS] ** : 0.962027
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001848 | Grad Max: 0.050177
  -> Layer: shared_layers.0.bias | Grad Mean: 0.030202 | Grad Max: 0.149376
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.007192
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002996 | Grad Max: 0.002996
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000357 | Grad Max: 0.089195
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005642 | Grad Max: 0.506701
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002381
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001275 | Grad Max: 0.010614
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000152
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000213 | Grad Max: 0.001276
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000061 | Grad Max: 0.000434
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000439 | Grad Max: 0.001134
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000618 | Grad Max: 0.000618
[GRADIENT NORM TOTAL] 1.0387

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.274
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59447926 0.40552074] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 569/1479 | B: 406/1450 | C: 275/1773
[LOSS Ex1] A: 0.66732 | B: 0.66601 | C: 0.66347
[LOGITS Ex2 A] Mean Abs: 1.645 | Max: 6.164
[LOSS Ex2] A: 0.21023 | B: 0.37742 | C: 0.33211
** [JOINT LOSS] ** : 0.972188
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005388 | Grad Max: 0.154283
  -> Layer: shared_layers.0.bias | Grad Mean: 0.252529 | Grad Max: 1.070743
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002028 | Grad Max: 0.007785
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006217 | Grad Max: 0.006217
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001790 | Grad Max: 0.147843
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032840 | Grad Max: 0.823122
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.009009
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015822 | Grad Max: 0.061945
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000617
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003538 | Grad Max: 0.007784
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000264
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001072 | Grad Max: 0.002522
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001744 | Grad Max: 0.003501
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029079 | Grad Max: 0.029079
[GRADIENT NORM TOTAL] 4.8649

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.352
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5594049  0.44059503] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.034
[MASKS] A(Pass/Fail): 471/1145 | B: 416/1632 | C: 256/1792
[LOSS Ex1] A: 0.66624 | B: 0.66535 | C: 0.66279
[LOGITS Ex2 A] Mean Abs: 1.678 | Max: 5.801
[LOSS Ex2] A: 0.18709 | B: 0.41132 | C: 0.29795
** [JOINT LOSS] ** : 0.963579
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004985 | Grad Max: 0.182958
  -> Layer: shared_layers.0.bias | Grad Mean: 0.109540 | Grad Max: 0.456085
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.007561
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000588 | Grad Max: 0.000588
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000966 | Grad Max: 0.184735
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017485 | Grad Max: 1.031997
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000161 | Grad Max: 0.004286
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006915 | Grad Max: 0.025382
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000321
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001586 | Grad Max: 0.004140
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000148
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000476 | Grad Max: 0.001189
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000637 | Grad Max: 0.002087
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011678 | Grad Max: 0.011678
[GRADIENT NORM TOTAL] 2.6007

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.479
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50593287 0.49406716] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.034
[MASKS] A(Pass/Fail): 597/1451 | B: 433/1615 | C: 209/1167
[LOSS Ex1] A: 0.66636 | B: 0.66588 | C: 0.65684
[LOGITS Ex2 A] Mean Abs: 1.643 | Max: 6.297
[LOSS Ex2] A: 0.19667 | B: 0.41181 | C: 0.30431
** [JOINT LOSS] ** : 0.967289
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002870 | Grad Max: 0.084669
  -> Layer: shared_layers.0.bias | Grad Mean: 0.231597 | Grad Max: 1.115242
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.007817
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007074 | Grad Max: 0.007074
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001511 | Grad Max: 0.200498
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028080 | Grad Max: 1.135391
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000287 | Grad Max: 0.009056
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012936 | Grad Max: 0.057576
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000513
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002729 | Grad Max: 0.006147
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000215
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000803 | Grad Max: 0.002030
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001155 | Grad Max: 0.002315
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020294 | Grad Max: 0.020294
[GRADIENT NORM TOTAL] 4.7231

[EPOCH SUMMARY] Train Loss: 0.9726

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9520 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 66/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.426
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5058904 0.4941096] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 583/1465 | B: 451/1597 | C: 289/1759
[LOSS Ex1] A: 0.66503 | B: 0.66282 | C: 0.66062
[LOGITS Ex2 A] Mean Abs: 1.625 | Max: 5.893
[LOSS Ex2] A: 0.19641 | B: 0.39440 | C: 0.30985
** [JOINT LOSS] ** : 0.963042
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004613 | Grad Max: 0.122702
  -> Layer: shared_layers.0.bias | Grad Mean: 0.325293 | Grad Max: 1.479474
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.008086
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006600 | Grad Max: 0.006600
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.241020
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038822 | Grad Max: 1.350279
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.012913
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017879 | Grad Max: 0.083006
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000634
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003894 | Grad Max: 0.008235
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000292
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001156 | Grad Max: 0.002994
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001744 | Grad Max: 0.003357
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029494 | Grad Max: 0.029494
[GRADIENT NORM TOTAL] 6.3582

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.438
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50151885 0.49848112] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 581/1467 | B: 409/1447 | C: 259/1789
[LOSS Ex1] A: 0.66406 | B: 0.66585 | C: 0.66252
[LOGITS Ex2 A] Mean Abs: 1.627 | Max: 7.640
[LOSS Ex2] A: 0.21492 | B: 0.37393 | C: 0.31606
** [JOINT LOSS] ** : 0.965781
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003029 | Grad Max: 0.079194
  -> Layer: shared_layers.0.bias | Grad Mean: 0.124509 | Grad Max: 0.506105
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.008619
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007569 | Grad Max: 0.007569
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000838 | Grad Max: 0.117702
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014464 | Grad Max: 0.659862
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000134 | Grad Max: 0.006740
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005728 | Grad Max: 0.032125
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000321
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001115 | Grad Max: 0.004009
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000136
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000313 | Grad Max: 0.000992
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000461 | Grad Max: 0.001547
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006672 | Grad Max: 0.006672
[GRADIENT NORM TOTAL] 2.4229

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.371
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044358  0.49556422] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.034
[MASKS] A(Pass/Fail): 543/1505 | B: 420/1628 | C: 266/1782
[LOSS Ex1] A: 0.66888 | B: 0.66518 | C: 0.66025
[LOGITS Ex2 A] Mean Abs: 1.646 | Max: 6.049
[LOSS Ex2] A: 0.19212 | B: 0.42919 | C: 0.32158
** [JOINT LOSS] ** : 0.979069
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005735 | Grad Max: 0.137934
  -> Layer: shared_layers.0.bias | Grad Mean: 0.425242 | Grad Max: 1.817919
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001988 | Grad Max: 0.006637
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000691 | Grad Max: 0.000691
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002706 | Grad Max: 0.264196
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051141 | Grad Max: 1.433863
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000537 | Grad Max: 0.014830
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024363 | Grad Max: 0.104707
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000847
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005314 | Grad Max: 0.011783
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000380
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001588 | Grad Max: 0.003788
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002327 | Grad Max: 0.004529
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041467 | Grad Max: 0.041467
[GRADIENT NORM TOTAL] 8.1912

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.287
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5249352  0.47506478] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 556/1492 | B: 434/1614 | C: 263/1785
[LOSS Ex1] A: 0.66850 | B: 0.66572 | C: 0.66075
[LOGITS Ex2 A] Mean Abs: 1.646 | Max: 6.123
[LOSS Ex2] A: 0.21378 | B: 0.42975 | C: 0.34315
** [JOINT LOSS] ** : 0.993883
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006632 | Grad Max: 0.179074
  -> Layer: shared_layers.0.bias | Grad Mean: 0.541468 | Grad Max: 2.397496
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.006964
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006098 | Grad Max: 0.006098
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003439 | Grad Max: 0.332695
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064670 | Grad Max: 1.837749
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000676 | Grad Max: 0.021379
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030847 | Grad Max: 0.142544
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001071
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006748 | Grad Max: 0.014312
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000466
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002006 | Grad Max: 0.004935
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002841 | Grad Max: 0.004987
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050227 | Grad Max: 0.050227
[GRADIENT NORM TOTAL] 10.5979

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.430
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6149688 0.3850312] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 617/1431 | B: 452/1596 | C: 286/1762
[LOSS Ex1] A: 0.66534 | B: 0.66265 | C: 0.66098
[LOGITS Ex2 A] Mean Abs: 1.672 | Max: 6.089
[LOSS Ex2] A: 0.19479 | B: 0.39525 | C: 0.33066
** [JOINT LOSS] ** : 0.969892
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003024 | Grad Max: 0.136895
  -> Layer: shared_layers.0.bias | Grad Mean: 0.335298 | Grad Max: 1.683821
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.007760
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007050 | Grad Max: 0.007050
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.219189
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038780 | Grad Max: 1.228159
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.012215
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018139 | Grad Max: 0.085413
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000662
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003909 | Grad Max: 0.008486
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000256
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001171 | Grad Max: 0.002692
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001693 | Grad Max: 0.003577
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030838 | Grad Max: 0.030838
[GRADIENT NORM TOTAL] 6.7727

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.481
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50004524 0.49995473] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 587/1461 | B: 411/1445 | C: 276/1772
[LOSS Ex1] A: 0.66916 | B: 0.66569 | C: 0.65896
[LOGITS Ex2 A] Mean Abs: 1.641 | Max: 5.521
[LOSS Ex2] A: 0.18994 | B: 0.38504 | C: 0.31102
** [JOINT LOSS] ** : 0.959936
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006520 | Grad Max: 0.234159
  -> Layer: shared_layers.0.bias | Grad Mean: 0.183907 | Grad Max: 0.628653
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.007060
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003001 | Grad Max: 0.003001
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001370 | Grad Max: 0.153331
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024446 | Grad Max: 0.845876
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.006328
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010801 | Grad Max: 0.037834
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000573
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002535 | Grad Max: 0.006797
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000209
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000762 | Grad Max: 0.001930
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001231 | Grad Max: 0.002622
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020158 | Grad Max: 0.020158
[GRADIENT NORM TOTAL] 3.5400

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.279
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59598196 0.404018  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 573/1475 | B: 426/1622 | C: 275/1773
[LOSS Ex1] A: 0.66696 | B: 0.66503 | C: 0.66052
[LOGITS Ex2 A] Mean Abs: 1.634 | Max: 5.933
[LOSS Ex2] A: 0.21221 | B: 0.41559 | C: 0.29685
** [JOINT LOSS] ** : 0.972388
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006524 | Grad Max: 0.204694
  -> Layer: shared_layers.0.bias | Grad Mean: 0.261879 | Grad Max: 0.959455
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.007912
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007991 | Grad Max: 0.007991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001823 | Grad Max: 0.195136
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033419 | Grad Max: 1.085478
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000344 | Grad Max: 0.009271
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015318 | Grad Max: 0.061272
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000612
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003449 | Grad Max: 0.008124
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000239
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001019 | Grad Max: 0.002519
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.002927
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024394 | Grad Max: 0.024394
[GRADIENT NORM TOTAL] 5.0210

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.357
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5607465  0.43925354] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 475/1141 | B: 435/1613 | C: 262/1786
[LOSS Ex1] A: 0.66589 | B: 0.66557 | C: 0.66196
[LOGITS Ex2 A] Mean Abs: 1.684 | Max: 5.843
[LOSS Ex2] A: 0.17392 | B: 0.40210 | C: 0.32217
** [JOINT LOSS] ** : 0.963873
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003314 | Grad Max: 0.103221
  -> Layer: shared_layers.0.bias | Grad Mean: 0.048429 | Grad Max: 0.213898
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.007038
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003554 | Grad Max: 0.003554
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000491 | Grad Max: 0.048068
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007927 | Grad Max: 0.217881
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.003103
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003092 | Grad Max: 0.017648
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000171
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000745 | Grad Max: 0.002318
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000116
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000241 | Grad Max: 0.000979
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000579 | Grad Max: 0.001735
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008147 | Grad Max: 0.008147
[GRADIENT NORM TOTAL] 1.1839

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.482
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059108 0.4940892] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 598/1450 | B: 454/1594 | C: 244/1804
[LOSS Ex1] A: 0.66601 | B: 0.66249 | C: 0.66194
[LOGITS Ex2 A] Mean Abs: 1.693 | Max: 6.582
[LOSS Ex2] A: 0.21052 | B: 0.39825 | C: 0.33048
** [JOINT LOSS] ** : 0.976564
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008877 | Grad Max: 0.236564
  -> Layer: shared_layers.0.bias | Grad Mean: 0.432353 | Grad Max: 1.828140
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.007801
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004902 | Grad Max: 0.004902
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002877 | Grad Max: 0.278094
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053580 | Grad Max: 1.413169
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000556 | Grad Max: 0.014159
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025043 | Grad Max: 0.098580
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000859
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005559 | Grad Max: 0.011790
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000384
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001667 | Grad Max: 0.003931
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002462 | Grad Max: 0.004738
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043154 | Grad Max: 0.043154
[GRADIENT NORM TOTAL] 8.2773

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.428
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059334 0.4940666] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 583/1465 | B: 411/1445 | C: 270/1778
[LOSS Ex1] A: 0.66467 | B: 0.66553 | C: 0.65925
[LOGITS Ex2 A] Mean Abs: 1.711 | Max: 5.995
[LOSS Ex2] A: 0.22271 | B: 0.38954 | C: 0.30329
** [JOINT LOSS] ** : 0.968337
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012668 | Grad Max: 0.367307
  -> Layer: shared_layers.0.bias | Grad Mean: 0.518921 | Grad Max: 2.362865
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.008428
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009079 | Grad Max: 0.009079
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003529 | Grad Max: 0.350315
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065864 | Grad Max: 1.723576
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000695 | Grad Max: 0.019574
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031258 | Grad Max: 0.126353
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000096 | Grad Max: 0.001087
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006991 | Grad Max: 0.014853
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000456
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002104 | Grad Max: 0.004701
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003209 | Grad Max: 0.006171
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.055452 | Grad Max: 0.055452
[GRADIENT NORM TOTAL] 9.7655

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.441
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015107  0.49848932] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 581/1467 | B: 427/1621 | C: 277/1771
[LOSS Ex1] A: 0.66371 | B: 0.66487 | C: 0.66150
[LOGITS Ex2 A] Mean Abs: 1.687 | Max: 6.553
[LOSS Ex2] A: 0.24194 | B: 0.41068 | C: 0.32364
** [JOINT LOSS] ** : 0.988781
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011318 | Grad Max: 0.412218
  -> Layer: shared_layers.0.bias | Grad Mean: 0.285211 | Grad Max: 1.134109
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.008066
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004585 | Grad Max: 0.004585
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.204541
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037840 | Grad Max: 0.894880
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.008820
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017541 | Grad Max: 0.059673
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000715
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004088 | Grad Max: 0.009000
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000307
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001243 | Grad Max: 0.003192
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001956 | Grad Max: 0.003426
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033137 | Grad Max: 0.033137
[GRADIENT NORM TOTAL] 5.4319

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.373
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044551 0.4955449] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.528 | Std: 0.034
[MASKS] A(Pass/Fail): 544/1504 | B: 436/1612 | C: 251/1797
[LOSS Ex1] A: 0.66858 | B: 0.66541 | C: 0.66269
[LOGITS Ex2 A] Mean Abs: 1.610 | Max: 6.349
[LOSS Ex2] A: 0.19685 | B: 0.40768 | C: 0.31119
** [JOINT LOSS] ** : 0.970801
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004767 | Grad Max: 0.123688
  -> Layer: shared_layers.0.bias | Grad Mean: 0.332387 | Grad Max: 1.615269
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001945 | Grad Max: 0.006715
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000568 | Grad Max: 0.000568
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.190674
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039335 | Grad Max: 1.076508
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.011808
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017127 | Grad Max: 0.079561
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000596
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003570 | Grad Max: 0.007995
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000268
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001039 | Grad Max: 0.002662
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001436 | Grad Max: 0.002825
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025627 | Grad Max: 0.025627
[GRADIENT NORM TOTAL] 6.7263

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.525393 0.474607] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.528 | Std: 0.033
[MASKS] A(Pass/Fail): 557/1491 | B: 455/1593 | C: 252/1796
[LOSS Ex1] A: 0.66821 | B: 0.66233 | C: 0.66472
[LOGITS Ex2 A] Mean Abs: 1.549 | Max: 5.825
[LOSS Ex2] A: 0.20649 | B: 0.42296 | C: 0.33631
** [JOINT LOSS] ** : 0.987010
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004738 | Grad Max: 0.153617
  -> Layer: shared_layers.0.bias | Grad Mean: 0.461979 | Grad Max: 1.983481
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.007579
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005645 | Grad Max: 0.005645
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002963 | Grad Max: 0.325120
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055059 | Grad Max: 1.801094
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000577 | Grad Max: 0.018141
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026458 | Grad Max: 0.126090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000979
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005626 | Grad Max: 0.012845
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000398
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001667 | Grad Max: 0.004155
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002265 | Grad Max: 0.004588
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042347 | Grad Max: 0.042347
[GRADIENT NORM TOTAL] 9.2701

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.433
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61684036 0.38315967] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.035
[MASKS] A(Pass/Fail): 619/1429 | B: 413/1443 | C: 173/1203
[LOSS Ex1] A: 0.66503 | B: 0.66539 | C: 0.66079
[LOGITS Ex2 A] Mean Abs: 1.619 | Max: 5.710
[LOSS Ex2] A: 0.19746 | B: 0.39914 | C: 0.34484
** [JOINT LOSS] ** : 0.977549
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004294 | Grad Max: 0.168766
  -> Layer: shared_layers.0.bias | Grad Mean: 0.482466 | Grad Max: 2.182149
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.007658
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002675 | Grad Max: 0.002675
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003043 | Grad Max: 0.333194
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056788 | Grad Max: 1.887400
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000593 | Grad Max: 0.017666
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027114 | Grad Max: 0.120609
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.001023
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005772 | Grad Max: 0.013336
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000397
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001709 | Grad Max: 0.004127
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002373 | Grad Max: 0.004370
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043563 | Grad Max: 0.043563
[GRADIENT NORM TOTAL] 9.7871

[EPOCH SUMMARY] Train Loss: 0.9741

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9458 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9504 -> New: 0.9458)

############################## EPOCH 67/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.484
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50011206 0.49988794] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.035
[MASKS] A(Pass/Fail): 587/1461 | B: 429/1619 | C: 280/1768
[LOSS Ex1] A: 0.66887 | B: 0.66473 | C: 0.66006
[LOGITS Ex2 A] Mean Abs: 1.649 | Max: 6.170
[LOSS Ex2] A: 0.18221 | B: 0.41058 | C: 0.32086
** [JOINT LOSS] ** : 0.969099
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002399 | Grad Max: 0.053266
  -> Layer: shared_layers.0.bias | Grad Mean: 0.090293 | Grad Max: 0.510388
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002012 | Grad Max: 0.006817
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003652 | Grad Max: 0.003652
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000661 | Grad Max: 0.123504
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011470 | Grad Max: 0.694173
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003925
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002999 | Grad Max: 0.023637
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000611 | Grad Max: 0.002745
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000171 | Grad Max: 0.000709
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001078
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003591 | Grad Max: 0.003591
[GRADIENT NORM TOTAL] 2.2006

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.283
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59740543 0.40259457] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.035
[MASKS] A(Pass/Fail): 573/1475 | B: 439/1609 | C: 272/1776
[LOSS Ex1] A: 0.66664 | B: 0.66527 | C: 0.65910
[LOGITS Ex2 A] Mean Abs: 1.695 | Max: 5.659
[LOSS Ex2] A: 0.21086 | B: 0.41786 | C: 0.31818
** [JOINT LOSS] ** : 0.979304
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006643 | Grad Max: 0.162922
  -> Layer: shared_layers.0.bias | Grad Mean: 0.415641 | Grad Max: 1.863426
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.006968
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002163 | Grad Max: 0.002163
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002748 | Grad Max: 0.247129
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051581 | Grad Max: 1.394622
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.014685
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024551 | Grad Max: 0.101955
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000894
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005409 | Grad Max: 0.012159
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000374
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001611 | Grad Max: 0.003818
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002296 | Grad Max: 0.004399
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041366 | Grad Max: 0.041366
[GRADIENT NORM TOTAL] 8.2999

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.361
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56159204 0.438408  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 476/1140 | B: 456/1592 | C: 274/1774
[LOSS Ex1] A: 0.66555 | B: 0.66219 | C: 0.66025
[LOGITS Ex2 A] Mean Abs: 1.728 | Max: 5.599
[LOSS Ex2] A: 0.20553 | B: 0.40699 | C: 0.34919
** [JOINT LOSS] ** : 0.983234
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006499 | Grad Max: 0.206623
  -> Layer: shared_layers.0.bias | Grad Mean: 0.579609 | Grad Max: 2.650167
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.008080
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004595 | Grad Max: 0.004595
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003705 | Grad Max: 0.329762
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070167 | Grad Max: 1.857152
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000738 | Grad Max: 0.022392
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033847 | Grad Max: 0.158605
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007356 | Grad Max: 0.016042
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000475
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002201 | Grad Max: 0.005158
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003162 | Grad Max: 0.005390
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.056716 | Grad Max: 0.056716
[GRADIENT NORM TOTAL] 11.4410

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.485
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060037  0.49399635] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 598/1450 | B: 413/1443 | C: 265/1783
[LOSS Ex1] A: 0.66569 | B: 0.66526 | C: 0.66199
[LOGITS Ex2 A] Mean Abs: 1.667 | Max: 7.011
[LOSS Ex2] A: 0.20253 | B: 0.38794 | C: 0.31756
** [JOINT LOSS] ** : 0.966986
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003505 | Grad Max: 0.143750
  -> Layer: shared_layers.0.bias | Grad Mean: 0.358644 | Grad Max: 1.764086
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.007630
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007420 | Grad Max: 0.007420
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.218800
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041860 | Grad Max: 1.228267
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.013466
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019660 | Grad Max: 0.087307
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000673
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004236 | Grad Max: 0.009195
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000289
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001269 | Grad Max: 0.003020
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001723 | Grad Max: 0.003859
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032660 | Grad Max: 0.032660
[GRADIENT NORM TOTAL] 7.2667

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.431
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50599015 0.49400988] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 583/1465 | B: 434/1614 | C: 256/1792
[LOSS Ex1] A: 0.66432 | B: 0.66459 | C: 0.66202
[LOGITS Ex2 A] Mean Abs: 1.638 | Max: 5.875
[LOSS Ex2] A: 0.20249 | B: 0.40693 | C: 0.30385
** [JOINT LOSS] ** : 0.968068
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005448 | Grad Max: 0.149128
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153919 | Grad Max: 0.564789
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.008775
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013267 | Grad Max: 0.013267
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001107 | Grad Max: 0.215564
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020138 | Grad Max: 1.214542
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.005319
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008398 | Grad Max: 0.032701
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000401
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001910 | Grad Max: 0.004815
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000171
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000565 | Grad Max: 0.001606
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000888 | Grad Max: 0.002230
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013739 | Grad Max: 0.013739
[GRADIENT NORM TOTAL] 3.2810

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.444
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015388 0.4984612] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 581/1467 | B: 439/1609 | C: 301/1747
[LOSS Ex1] A: 0.66337 | B: 0.66514 | C: 0.65909
[LOGITS Ex2 A] Mean Abs: 1.623 | Max: 7.703
[LOSS Ex2] A: 0.21622 | B: 0.40846 | C: 0.31260
** [JOINT LOSS] ** : 0.974964
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006846 | Grad Max: 0.202690
  -> Layer: shared_layers.0.bias | Grad Mean: 0.275538 | Grad Max: 1.169728
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.007515
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003143 | Grad Max: 0.003143
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.235046
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034830 | Grad Max: 1.329097
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.010348
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016415 | Grad Max: 0.070580
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000567
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003640 | Grad Max: 0.008354
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000266
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001089 | Grad Max: 0.002577
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001739 | Grad Max: 0.003037
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028716 | Grad Max: 0.028716
[GRADIENT NORM TOTAL] 5.2766

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.376
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50445807 0.4955419 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 545/1503 | B: 458/1590 | C: 249/1799
[LOSS Ex1] A: 0.66830 | B: 0.66205 | C: 0.66119
[LOGITS Ex2 A] Mean Abs: 1.621 | Max: 6.035
[LOSS Ex2] A: 0.20018 | B: 0.38381 | C: 0.32631
** [JOINT LOSS] ** : 0.967283
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004947 | Grad Max: 0.164636
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094819 | Grad Max: 0.468058
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.006629
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001133 | Grad Max: 0.001133
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000844 | Grad Max: 0.063062
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014784 | Grad Max: 0.260678
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.004595
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006759 | Grad Max: 0.026568
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000320
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001587 | Grad Max: 0.004051
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000175
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000482 | Grad Max: 0.001604
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000896 | Grad Max: 0.002317
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013618 | Grad Max: 0.013618
[GRADIENT NORM TOTAL] 2.0048

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.295
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52572364 0.4742764 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 559/1489 | B: 413/1443 | C: 271/1777
[LOSS Ex1] A: 0.66794 | B: 0.66512 | C: 0.66069
[LOGITS Ex2 A] Mean Abs: 1.631 | Max: 5.899
[LOSS Ex2] A: 0.20572 | B: 0.38284 | C: 0.30874
** [JOINT LOSS] ** : 0.963689
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005639 | Grad Max: 0.146123
  -> Layer: shared_layers.0.bias | Grad Mean: 0.434020 | Grad Max: 1.853808
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001956 | Grad Max: 0.007119
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000533 | Grad Max: 0.000533
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002838 | Grad Max: 0.306358
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053613 | Grad Max: 1.760883
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.015775
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025209 | Grad Max: 0.106029
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000819
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005522 | Grad Max: 0.011846
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000362
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001661 | Grad Max: 0.003915
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002372 | Grad Max: 0.004984
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043370 | Grad Max: 0.043370
[GRADIENT NORM TOTAL] 8.7208

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.436
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6185442  0.38145578] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.035
[MASKS] A(Pass/Fail): 619/1429 | B: 434/1614 | C: 252/1796
[LOSS Ex1] A: 0.66472 | B: 0.66445 | C: 0.66165
[LOGITS Ex2 A] Mean Abs: 1.690 | Max: 5.372
[LOSS Ex2] A: 0.21203 | B: 0.44284 | C: 0.33142
** [JOINT LOSS] ** : 0.992371
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012720 | Grad Max: 0.333733
  -> Layer: shared_layers.0.bias | Grad Mean: 0.683264 | Grad Max: 2.889529
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.008165
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009632 | Grad Max: 0.009632
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004620 | Grad Max: 0.403848
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086003 | Grad Max: 2.106580
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000891 | Grad Max: 0.023642
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040547 | Grad Max: 0.162252
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000120 | Grad Max: 0.001354
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009014 | Grad Max: 0.019489
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000584
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002698 | Grad Max: 0.006226
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003864 | Grad Max: 0.006615
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.068802 | Grad Max: 0.068802
[GRADIENT NORM TOTAL] 13.1350

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.488
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5001039 0.4998961] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 593/1455 | B: 442/1606 | C: 283/1765
[LOSS Ex1] A: 0.66859 | B: 0.66500 | C: 0.65763
[LOGITS Ex2 A] Mean Abs: 1.694 | Max: 5.680
[LOSS Ex2] A: 0.20112 | B: 0.41550 | C: 0.32325
** [JOINT LOSS] ** : 0.977028
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008744 | Grad Max: 0.199699
  -> Layer: shared_layers.0.bias | Grad Mean: 0.472219 | Grad Max: 2.059389
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.006946
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000191 | Grad Max: 0.000191
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003140 | Grad Max: 0.311606
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058877 | Grad Max: 1.695468
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000619 | Grad Max: 0.017089
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028266 | Grad Max: 0.125269
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000972
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006240 | Grad Max: 0.013446
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000444
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001861 | Grad Max: 0.004482
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002590 | Grad Max: 0.004645
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046848 | Grad Max: 0.046848
[GRADIENT NORM TOTAL] 9.1221

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.287
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5987494  0.40125066] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 575/1473 | B: 459/1589 | C: 235/1813
[LOSS Ex1] A: 0.66635 | B: 0.66190 | C: 0.66337
[LOGITS Ex2 A] Mean Abs: 1.654 | Max: 5.646
[LOSS Ex2] A: 0.20620 | B: 0.38971 | C: 0.32076
** [JOINT LOSS] ** : 0.969430
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004014 | Grad Max: 0.139128
  -> Layer: shared_layers.0.bias | Grad Mean: 0.059880 | Grad Max: 0.232132
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.007573
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006032 | Grad Max: 0.006032
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000563 | Grad Max: 0.148873
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008982 | Grad Max: 0.840080
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003330
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001983 | Grad Max: 0.014456
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000184
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000445 | Grad Max: 0.001741
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000077
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000143 | Grad Max: 0.000583
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000335 | Grad Max: 0.001238
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004666 | Grad Max: 0.004666
[GRADIENT NORM TOTAL] 1.7570

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.365
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5623655  0.43763456] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.035
[MASKS] A(Pass/Fail): 476/1140 | B: 413/1443 | C: 258/1790
[LOSS Ex1] A: 0.66524 | B: 0.66497 | C: 0.66195
[LOGITS Ex2 A] Mean Abs: 1.625 | Max: 5.720
[LOSS Ex2] A: 0.18999 | B: 0.41450 | C: 0.32006
** [JOINT LOSS] ** : 0.972239
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005479 | Grad Max: 0.181313
  -> Layer: shared_layers.0.bias | Grad Mean: 0.553273 | Grad Max: 2.444166
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.007611
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002134 | Grad Max: 0.002134
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003402 | Grad Max: 0.335782
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064443 | Grad Max: 1.893171
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000696 | Grad Max: 0.020873
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032160 | Grad Max: 0.145374
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001030
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006997 | Grad Max: 0.014644
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000528
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002083 | Grad Max: 0.005253
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002900 | Grad Max: 0.005392
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052937 | Grad Max: 0.052937
[GRADIENT NORM TOTAL] 10.7714

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.489
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060092  0.49399072] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 599/1449 | B: 435/1613 | C: 269/1779
[LOSS Ex1] A: 0.66539 | B: 0.66430 | C: 0.65972
[LOGITS Ex2 A] Mean Abs: 1.615 | Max: 5.956
[LOSS Ex2] A: 0.20185 | B: 0.46517 | C: 0.34574
** [JOINT LOSS] ** : 1.000722
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009133 | Grad Max: 0.245748
  -> Layer: shared_layers.0.bias | Grad Mean: 0.747361 | Grad Max: 3.168656
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.007903
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004635 | Grad Max: 0.004635
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004697 | Grad Max: 0.458427
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088538 | Grad Max: 2.608337
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000926 | Grad Max: 0.026644
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042789 | Grad Max: 0.184836
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000124 | Grad Max: 0.001405
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009424 | Grad Max: 0.020174
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000635
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002831 | Grad Max: 0.006886
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003920 | Grad Max: 0.008004
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.072617 | Grad Max: 0.072617
[GRADIENT NORM TOTAL] 14.4816

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.435
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50610363 0.49389634] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.036
[MASKS] A(Pass/Fail): 585/1463 | B: 442/1606 | C: 170/1206
[LOSS Ex1] A: 0.66400 | B: 0.66485 | C: 0.66190
[LOGITS Ex2 A] Mean Abs: 1.620 | Max: 6.100
[LOSS Ex2] A: 0.19300 | B: 0.43257 | C: 0.33592
** [JOINT LOSS] ** : 0.984079
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006243 | Grad Max: 0.191738
  -> Layer: shared_layers.0.bias | Grad Mean: 0.566292 | Grad Max: 2.451530
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.007459
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004910 | Grad Max: 0.004910
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003559 | Grad Max: 0.364475
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067478 | Grad Max: 2.074544
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000709 | Grad Max: 0.019414
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032789 | Grad Max: 0.138471
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001071
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007183 | Grad Max: 0.015100
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000495
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002150 | Grad Max: 0.005178
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003051 | Grad Max: 0.006012
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.055575 | Grad Max: 0.055575
[GRADIENT NORM TOTAL] 11.0709

[EPOCH SUMMARY] Train Loss: 0.9763

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9441 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9458 -> New: 0.9441)

############################## EPOCH 68/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.448
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015115  0.49848846] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 586/1462 | B: 459/1589 | C: 280/1768
[LOSS Ex1] A: 0.66305 | B: 0.66175 | C: 0.65826
[LOGITS Ex2 A] Mean Abs: 1.631 | Max: 7.201
[LOSS Ex2] A: 0.20463 | B: 0.38928 | C: 0.32248
** [JOINT LOSS] ** : 0.966483
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002385 | Grad Max: 0.066120
  -> Layer: shared_layers.0.bias | Grad Mean: 0.120142 | Grad Max: 0.564941
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002284 | Grad Max: 0.007813
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000584 | Grad Max: 0.000584
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000858 | Grad Max: 0.131564
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014884 | Grad Max: 0.733691
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.004560
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005930 | Grad Max: 0.030848
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000297
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001254 | Grad Max: 0.003427
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000108
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000369 | Grad Max: 0.000978
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000621 | Grad Max: 0.001658
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010125 | Grad Max: 0.010125
[GRADIENT NORM TOTAL] 2.5889

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.379
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50443184 0.49556813] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.035
[MASKS] A(Pass/Fail): 548/1500 | B: 413/1443 | C: 257/1791
[LOSS Ex1] A: 0.66802 | B: 0.66483 | C: 0.66105
[LOGITS Ex2 A] Mean Abs: 1.677 | Max: 5.549
[LOSS Ex2] A: 0.20739 | B: 0.39912 | C: 0.32727
** [JOINT LOSS] ** : 0.975891
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007720 | Grad Max: 0.191644
  -> Layer: shared_layers.0.bias | Grad Mean: 0.582976 | Grad Max: 2.519759
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001992 | Grad Max: 0.006826
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004354 | Grad Max: 0.004354
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003705 | Grad Max: 0.337070
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070283 | Grad Max: 1.867501
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000740 | Grad Max: 0.022600
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034054 | Grad Max: 0.155388
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.001039
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007465 | Grad Max: 0.015408
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000472
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002247 | Grad Max: 0.005051
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003156 | Grad Max: 0.005639
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057394 | Grad Max: 0.057395
[GRADIENT NORM TOTAL] 11.3001

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.299
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5260454  0.47395465] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.034
[MASKS] A(Pass/Fail): 560/1488 | B: 436/1612 | C: 243/1805
[LOSS Ex1] A: 0.66767 | B: 0.66415 | C: 0.66295
[LOGITS Ex2 A] Mean Abs: 1.672 | Max: 6.633
[LOSS Ex2] A: 0.21987 | B: 0.45504 | C: 0.33638
** [JOINT LOSS] ** : 1.002023
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011070 | Grad Max: 0.284471
  -> Layer: shared_layers.0.bias | Grad Mean: 0.843160 | Grad Max: 3.741110
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.006986
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000002 | Grad Max: 0.000002
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005367 | Grad Max: 0.476806
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.101836 | Grad Max: 2.563154
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001065 | Grad Max: 0.032972
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049219 | Grad Max: 0.226917
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000141 | Grad Max: 0.001613
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010772 | Grad Max: 0.022931
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000693
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003221 | Grad Max: 0.007497
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004562 | Grad Max: 0.008062
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.082102 | Grad Max: 0.082102
[GRADIENT NORM TOTAL] 16.2230

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.439
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62028205 0.37971792] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 619/1429 | B: 443/1605 | C: 289/1759
[LOSS Ex1] A: 0.66440 | B: 0.66471 | C: 0.65837
[LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.092
[LOSS Ex2] A: 0.21249 | B: 0.45013 | C: 0.34532
** [JOINT LOSS] ** : 0.998470
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008916 | Grad Max: 0.279665
  -> Layer: shared_layers.0.bias | Grad Mean: 0.823246 | Grad Max: 3.630442
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.008610
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013536 | Grad Max: 0.013536
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005226 | Grad Max: 0.490289
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.098996 | Grad Max: 2.731357
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001029 | Grad Max: 0.031191
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047623 | Grad Max: 0.214221
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000136 | Grad Max: 0.001513
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010411 | Grad Max: 0.021773
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.000656
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003122 | Grad Max: 0.007424
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004386 | Grad Max: 0.007532
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.079350 | Grad Max: 0.079350
[GRADIENT NORM TOTAL] 16.1818

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.491
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500085 0.499915] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 598/1450 | B: 459/1589 | C: 289/1759
[LOSS Ex1] A: 0.66832 | B: 0.66161 | C: 0.65761
[LOGITS Ex2 A] Mean Abs: 1.680 | Max: 5.536
[LOSS Ex2] A: 0.19965 | B: 0.39731 | C: 0.29216
** [JOINT LOSS] ** : 0.958886
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004611 | Grad Max: 0.147308
  -> Layer: shared_layers.0.bias | Grad Mean: 0.392690 | Grad Max: 1.854511
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.007005
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003110 | Grad Max: 0.003110
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002539 | Grad Max: 0.263986
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046285 | Grad Max: 1.502786
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000466 | Grad Max: 0.015448
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021493 | Grad Max: 0.104800
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000689
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004552 | Grad Max: 0.009319
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001373 | Grad Max: 0.003139
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001925 | Grad Max: 0.004391
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036190 | Grad Max: 0.036190
[GRADIENT NORM TOTAL] 7.8991

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.60016036 0.3998396 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.036
[MASKS] A(Pass/Fail): 575/1473 | B: 413/1443 | C: 268/1780
[LOSS Ex1] A: 0.66605 | B: 0.66471 | C: 0.65846
[LOGITS Ex2 A] Mean Abs: 1.628 | Max: 5.961
[LOSS Ex2] A: 0.20960 | B: 0.37932 | C: 0.30977
** [JOINT LOSS] ** : 0.962634
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006934 | Grad Max: 0.257121
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239068 | Grad Max: 0.909150
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.007303
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001624 | Grad Max: 0.001624
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001664 | Grad Max: 0.197969
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030316 | Grad Max: 0.958885
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.007571
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013929 | Grad Max: 0.052399
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000560
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003225 | Grad Max: 0.007714
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000248
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000970 | Grad Max: 0.002387
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001344 | Grad Max: 0.003051
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024323 | Grad Max: 0.024323
[GRADIENT NORM TOTAL] 4.4524

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.368
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56327045 0.43672958] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 476/1140 | B: 437/1611 | C: 250/1798
[LOSS Ex1] A: 0.66494 | B: 0.66404 | C: 0.66263
[LOGITS Ex2 A] Mean Abs: 1.632 | Max: 5.869
[LOSS Ex2] A: 0.20380 | B: 0.41843 | C: 0.34120
** [JOINT LOSS] ** : 0.985009
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009507 | Grad Max: 0.273048
  -> Layer: shared_layers.0.bias | Grad Mean: 0.484231 | Grad Max: 2.026432
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.007666
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002615 | Grad Max: 0.002615
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003222 | Grad Max: 0.359590
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060080 | Grad Max: 1.964628
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000636 | Grad Max: 0.018524
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028909 | Grad Max: 0.128727
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000946
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006446 | Grad Max: 0.014113
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000430
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001948 | Grad Max: 0.004641
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002802 | Grad Max: 0.005270
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049928 | Grad Max: 0.049928
[GRADIENT NORM TOTAL] 9.2322

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.492
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50600916 0.4939908 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 601/1447 | B: 443/1605 | C: 258/1790
[LOSS Ex1] A: 0.66511 | B: 0.66460 | C: 0.65990
[LOGITS Ex2 A] Mean Abs: 1.642 | Max: 6.457
[LOSS Ex2] A: 0.19250 | B: 0.42016 | C: 0.30521
** [JOINT LOSS] ** : 0.969159
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007877 | Grad Max: 0.214519
  -> Layer: shared_layers.0.bias | Grad Mean: 0.383044 | Grad Max: 1.600721
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.007673
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006045 | Grad Max: 0.006045
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002506 | Grad Max: 0.260541
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046968 | Grad Max: 1.369313
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000496 | Grad Max: 0.015327
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022665 | Grad Max: 0.095142
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000771
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005064 | Grad Max: 0.010655
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000352
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001516 | Grad Max: 0.003788
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002140 | Grad Max: 0.003826
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038270 | Grad Max: 0.038270
[GRADIENT NORM TOTAL] 7.1035

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.438
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50625396 0.493746  ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 585/1463 | B: 461/1587 | C: 255/1793
[LOSS Ex1] A: 0.66370 | B: 0.66150 | C: 0.66204
[LOGITS Ex2 A] Mean Abs: 1.653 | Max: 6.771
[LOSS Ex2] A: 0.18623 | B: 0.38304 | C: 0.32547
** [JOINT LOSS] ** : 0.960660
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.061359
  -> Layer: shared_layers.0.bias | Grad Mean: 0.115564 | Grad Max: 0.612808
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.007825
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003718 | Grad Max: 0.003718
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000745 | Grad Max: 0.087430
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013674 | Grad Max: 0.470574
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.004736
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005948 | Grad Max: 0.029527
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000264
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001298 | Grad Max: 0.003595
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000119
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000395 | Grad Max: 0.001188
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.002143
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010874 | Grad Max: 0.010874
[GRADIENT NORM TOTAL] 2.2727

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.451
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015183  0.49848166] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 587/1461 | B: 415/1441 | C: 278/1770
[LOSS Ex1] A: 0.66276 | B: 0.66460 | C: 0.65858
[LOGITS Ex2 A] Mean Abs: 1.652 | Max: 6.782
[LOSS Ex2] A: 0.21619 | B: 0.38163 | C: 0.31844
** [JOINT LOSS] ** : 0.967397
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005416 | Grad Max: 0.171184
  -> Layer: shared_layers.0.bias | Grad Mean: 0.228090 | Grad Max: 0.924497
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.007583
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001025 | Grad Max: 0.001025
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001630 | Grad Max: 0.145437
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029466 | Grad Max: 0.805916
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.007714
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013333 | Grad Max: 0.051573
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000507
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003049 | Grad Max: 0.006832
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000224
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000943 | Grad Max: 0.002372
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001336 | Grad Max: 0.003066
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024455 | Grad Max: 0.024455
[GRADIENT NORM TOTAL] 4.4064

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.381
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044405  0.49555948] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.529 | Std: 0.035
[MASKS] A(Pass/Fail): 551/1497 | B: 439/1609 | C: 252/1796
[LOSS Ex1] A: 0.66776 | B: 0.66392 | C: 0.66155
[LOGITS Ex2 A] Mean Abs: 1.614 | Max: 5.687
[LOSS Ex2] A: 0.19442 | B: 0.40874 | C: 0.31345
** [JOINT LOSS] ** : 0.969948
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.061532
  -> Layer: shared_layers.0.bias | Grad Mean: 0.060791 | Grad Max: 0.246718
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001945 | Grad Max: 0.006308
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000185 | Grad Max: 0.000185
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000466 | Grad Max: 0.120164
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007593 | Grad Max: 0.675867
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003978
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001743 | Grad Max: 0.022227
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000140
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000306 | Grad Max: 0.001849
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000087 | Grad Max: 0.000639
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000385 | Grad Max: 0.001257
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001199 | Grad Max: 0.001199
[GRADIENT NORM TOTAL] 1.6036

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.302
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5263743 0.4736257] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.529 | Std: 0.035
[MASKS] A(Pass/Fail): 561/1487 | B: 444/1604 | C: 282/1766
[LOSS Ex1] A: 0.66743 | B: 0.66447 | C: 0.65869
[LOGITS Ex2 A] Mean Abs: 1.553 | Max: 6.092
[LOSS Ex2] A: 0.19826 | B: 0.40576 | C: 0.33016
** [JOINT LOSS] ** : 0.974922
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.053337
  -> Layer: shared_layers.0.bias | Grad Mean: 0.161242 | Grad Max: 0.721753
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.006824
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007561 | Grad Max: 0.007561
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001119 | Grad Max: 0.167833
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020337 | Grad Max: 0.942593
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.007871
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009370 | Grad Max: 0.050296
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000317
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002005 | Grad Max: 0.005101
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000165
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000599 | Grad Max: 0.001577
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000883 | Grad Max: 0.002238
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015485 | Grad Max: 0.015485
[GRADIENT NORM TOTAL] 3.3499

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.442
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6218216  0.37817842] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.036
[MASKS] A(Pass/Fail): 619/1429 | B: 461/1587 | C: 257/1791
[LOSS Ex1] A: 0.66410 | B: 0.66135 | C: 0.66220
[LOGITS Ex2 A] Mean Abs: 1.633 | Max: 6.429
[LOSS Ex2] A: 0.18171 | B: 0.38327 | C: 0.32229
** [JOINT LOSS] ** : 0.958309
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003132 | Grad Max: 0.069792
  -> Layer: shared_layers.0.bias | Grad Mean: 0.101516 | Grad Max: 0.353820
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.008699
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012697 | Grad Max: 0.012697
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000731 | Grad Max: 0.093672
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013181 | Grad Max: 0.519939
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.005069
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005410 | Grad Max: 0.025186
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000277
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001225 | Grad Max: 0.003226
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000142
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000381 | Grad Max: 0.001323
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000639 | Grad Max: 0.002209
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010846 | Grad Max: 0.010846
[GRADIENT NORM TOTAL] 2.0588

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.495
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50009936 0.49990064] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.036
[MASKS] A(Pass/Fail): 601/1447 | B: 419/1437 | C: 178/1198
[LOSS Ex1] A: 0.66807 | B: 0.66444 | C: 0.66212
[LOGITS Ex2 A] Mean Abs: 1.631 | Max: 5.325
[LOSS Ex2] A: 0.18647 | B: 0.37597 | C: 0.31693
** [JOINT LOSS] ** : 0.958003
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003253 | Grad Max: 0.112730
  -> Layer: shared_layers.0.bias | Grad Mean: 0.050371 | Grad Max: 0.350805
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.007040
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006752 | Grad Max: 0.006752
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000495 | Grad Max: 0.125089
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007715 | Grad Max: 0.632772
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003169
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001480 | Grad Max: 0.016426
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000185
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000284 | Grad Max: 0.001640
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000084
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000090 | Grad Max: 0.000574
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000510 | Grad Max: 0.001407
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000945 | Grad Max: 0.000945
[GRADIENT NORM TOTAL] 1.4470

[EPOCH SUMMARY] Train Loss: 0.9720

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9436 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9441 -> New: 0.9436)

############################## EPOCH 69/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.295
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.60154873 0.3984513 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.036
[MASKS] A(Pass/Fail): 581/1467 | B: 442/1606 | C: 291/1757
[LOSS Ex1] A: 0.66574 | B: 0.66375 | C: 0.65828
[LOGITS Ex2 A] Mean Abs: 1.617 | Max: 5.802
[LOSS Ex2] A: 0.20468 | B: 0.40841 | C: 0.30875
** [JOINT LOSS] ** : 0.969872
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003068 | Grad Max: 0.069562
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141796 | Grad Max: 0.568703
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.007413
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001472 | Grad Max: 0.001472
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000956 | Grad Max: 0.100141
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017456 | Grad Max: 0.569125
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.006235
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008136 | Grad Max: 0.037396
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000414
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001839 | Grad Max: 0.005095
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000148
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000549 | Grad Max: 0.001521
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000749 | Grad Max: 0.001984
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013213 | Grad Max: 0.013213
[GRADIENT NORM TOTAL] 2.6909

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.372
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5641892 0.4358108] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 481/1135 | B: 447/1601 | C: 274/1774
[LOSS Ex1] A: 0.66460 | B: 0.66430 | C: 0.65806
[LOGITS Ex2 A] Mean Abs: 1.692 | Max: 5.675
[LOSS Ex2] A: 0.18315 | B: 0.39885 | C: 0.30214
** [JOINT LOSS] ** : 0.957034
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002459 | Grad Max: 0.076015
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201169 | Grad Max: 0.931369
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.007770
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003572 | Grad Max: 0.003572
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001322 | Grad Max: 0.134376
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024518 | Grad Max: 0.751826
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.008725
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010859 | Grad Max: 0.054676
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000421
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002378 | Grad Max: 0.005628
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000184
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000721 | Grad Max: 0.001906
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001014 | Grad Max: 0.002689
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019136 | Grad Max: 0.019136
[GRADIENT NORM TOTAL] 4.1162

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.496
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50601035 0.4939897 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.037
[MASKS] A(Pass/Fail): 604/1444 | B: 462/1586 | C: 315/1733
[LOSS Ex1] A: 0.66477 | B: 0.66116 | C: 0.65495
[LOGITS Ex2 A] Mean Abs: 1.661 | Max: 5.855
[LOSS Ex2] A: 0.19272 | B: 0.38185 | C: 0.29911
** [JOINT LOSS] ** : 0.951523
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002326 | Grad Max: 0.052424
  -> Layer: shared_layers.0.bias | Grad Mean: 0.142668 | Grad Max: 0.613045
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.007056
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001925 | Grad Max: 0.001925
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001005 | Grad Max: 0.118986
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017809 | Grad Max: 0.671348
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.004507
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007414 | Grad Max: 0.030357
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000302
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001676 | Grad Max: 0.004123
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000518 | Grad Max: 0.001430
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000715 | Grad Max: 0.002483
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013931 | Grad Max: 0.013931
[GRADIENT NORM TOTAL] 2.9808

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.442
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5065246  0.49347535] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.037
[MASKS] A(Pass/Fail): 588/1460 | B: 420/1436 | C: 266/1782
[LOSS Ex1] A: 0.66331 | B: 0.66426 | C: 0.65987
[LOGITS Ex2 A] Mean Abs: 1.639 | Max: 5.798
[LOSS Ex2] A: 0.17872 | B: 0.38583 | C: 0.32499
** [JOINT LOSS] ** : 0.958995
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002399 | Grad Max: 0.068024
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196769 | Grad Max: 0.883930
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.008024
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007063 | Grad Max: 0.007063
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001325 | Grad Max: 0.238899
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024364 | Grad Max: 1.345304
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000247 | Grad Max: 0.009680
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011385 | Grad Max: 0.059641
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000436
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002422 | Grad Max: 0.005755
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000194
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000714 | Grad Max: 0.001894
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001020 | Grad Max: 0.002356
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017523 | Grad Max: 0.017523
[GRADIENT NORM TOTAL] 4.1863

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.455
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015182  0.49848184] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 593/1455 | B: 442/1606 | C: 288/1760
[LOSS Ex1] A: 0.66235 | B: 0.66356 | C: 0.65676
[LOGITS Ex2 A] Mean Abs: 1.624 | Max: 6.136
[LOSS Ex2] A: 0.21346 | B: 0.41391 | C: 0.32685
** [JOINT LOSS] ** : 0.978963
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003136 | Grad Max: 0.078773
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176172 | Grad Max: 0.839682
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.008179
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005374 | Grad Max: 0.005374
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001279 | Grad Max: 0.275406
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022763 | Grad Max: 1.544173
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.007429
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009743 | Grad Max: 0.048180
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000352
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002043 | Grad Max: 0.004820
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000177
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000604 | Grad Max: 0.001552
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000910 | Grad Max: 0.001886
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015770 | Grad Max: 0.015770
[GRADIENT NORM TOTAL] 4.0155

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.385
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50440884 0.49559116] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.036
[MASKS] A(Pass/Fail): 557/1491 | B: 449/1599 | C: 249/1799
[LOSS Ex1] A: 0.66739 | B: 0.66411 | C: 0.66251
[LOGITS Ex2 A] Mean Abs: 1.643 | Max: 6.058
[LOSS Ex2] A: 0.18531 | B: 0.40608 | C: 0.33488
** [JOINT LOSS] ** : 0.973429
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002284 | Grad Max: 0.054663
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125636 | Grad Max: 0.427099
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.006180
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002634 | Grad Max: 0.002634
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000900 | Grad Max: 0.168024
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016341 | Grad Max: 0.941334
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.003839
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005120 | Grad Max: 0.027220
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000284
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001124 | Grad Max: 0.003509
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000116
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000324 | Grad Max: 0.001091
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.001207
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006622 | Grad Max: 0.006622
[GRADIENT NORM TOTAL] 2.9104

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.308
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5269097  0.47309032] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.035
[MASKS] A(Pass/Fail): 561/1487 | B: 463/1585 | C: 284/1764
[LOSS Ex1] A: 0.66707 | B: 0.66097 | C: 0.65946
[LOGITS Ex2 A] Mean Abs: 1.622 | Max: 5.907
[LOSS Ex2] A: 0.19377 | B: 0.39493 | C: 0.30766
** [JOINT LOSS] ** : 0.961288
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003058 | Grad Max: 0.097831
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116428 | Grad Max: 0.547028
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.007520
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009039 | Grad Max: 0.009039
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000821 | Grad Max: 0.142731
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014695 | Grad Max: 0.752586
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.004341
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005516 | Grad Max: 0.026801
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000261
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001248 | Grad Max: 0.003397
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000363 | Grad Max: 0.001014
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000446 | Grad Max: 0.001721
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007928 | Grad Max: 0.007928
[GRADIENT NORM TOTAL] 2.4951

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.446
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6241763 0.3758237] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.037
[MASKS] A(Pass/Fail): 621/1427 | B: 420/1436 | C: 244/1804
[LOSS Ex1] A: 0.66366 | B: 0.66407 | C: 0.66187
[LOGITS Ex2 A] Mean Abs: 1.658 | Max: 5.891
[LOSS Ex2] A: 0.18326 | B: 0.38552 | C: 0.29274
** [JOINT LOSS] ** : 0.950376
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.044591
  -> Layer: shared_layers.0.bias | Grad Mean: 0.086373 | Grad Max: 0.369956
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.007598
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006914 | Grad Max: 0.006914
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000621 | Grad Max: 0.080407
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011213 | Grad Max: 0.446311
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.004528
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005743 | Grad Max: 0.028925
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000269
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001246 | Grad Max: 0.003431
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000153
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000350 | Grad Max: 0.001096
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000555 | Grad Max: 0.001839
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007449 | Grad Max: 0.007449
[GRADIENT NORM TOTAL] 1.7256

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.499
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50009197 0.49990803] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.036
[MASKS] A(Pass/Fail): 603/1445 | B: 443/1605 | C: 271/1777
[LOSS Ex1] A: 0.66768 | B: 0.66336 | C: 0.65949
[LOGITS Ex2 A] Mean Abs: 1.683 | Max: 5.361
[LOSS Ex2] A: 0.18353 | B: 0.40048 | C: 0.30611
** [JOINT LOSS] ** : 0.960216
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.060218
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127937 | Grad Max: 0.627055
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.006986
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005746 | Grad Max: 0.005746
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000880 | Grad Max: 0.102083
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015926 | Grad Max: 0.571334
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000155 | Grad Max: 0.005865
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007072 | Grad Max: 0.038441
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000351
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001464 | Grad Max: 0.004120
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000425 | Grad Max: 0.001161
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000506 | Grad Max: 0.001833
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010188 | Grad Max: 0.010188
[GRADIENT NORM TOTAL] 2.5900

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.301
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.60350955 0.39649045] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.037
[MASKS] A(Pass/Fail): 584/1464 | B: 452/1596 | C: 237/1811
[LOSS Ex1] A: 0.66529 | B: 0.66391 | C: 0.66273
[LOGITS Ex2 A] Mean Abs: 1.659 | Max: 5.755
[LOSS Ex2] A: 0.20525 | B: 0.40091 | C: 0.31191
** [JOINT LOSS] ** : 0.970002
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003177 | Grad Max: 0.099604
  -> Layer: shared_layers.0.bias | Grad Mean: 0.065118 | Grad Max: 0.263539
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.007820
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008072 | Grad Max: 0.008072
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000463 | Grad Max: 0.122020
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007699 | Grad Max: 0.674551
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002912
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001763 | Grad Max: 0.016014
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000182
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000365 | Grad Max: 0.001884
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000113 | Grad Max: 0.000637
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000363 | Grad Max: 0.001323
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003314 | Grad Max: 0.003314
[GRADIENT NORM TOTAL] 1.5137

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.378
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5655113  0.43448865] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.037
[MASKS] A(Pass/Fail): 481/1135 | B: 463/1585 | C: 271/1777
[LOSS Ex1] A: 0.66413 | B: 0.66075 | C: 0.65757
[LOGITS Ex2 A] Mean Abs: 1.715 | Max: 5.764
[LOSS Ex2] A: 0.18535 | B: 0.38578 | C: 0.31241
** [JOINT LOSS] ** : 0.955328
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.036117
  -> Layer: shared_layers.0.bias | Grad Mean: 0.071461 | Grad Max: 0.306810
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.007718
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000356 | Grad Max: 0.000356
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000533 | Grad Max: 0.108123
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009456 | Grad Max: 0.598181
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003488
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002846 | Grad Max: 0.021683
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000255
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000651 | Grad Max: 0.002303
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000206 | Grad Max: 0.000849
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000445 | Grad Max: 0.001549
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006219 | Grad Max: 0.006219
[GRADIENT NORM TOTAL] 1.7418

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.501
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50605017 0.49394986] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.532 | Std: 0.038
[MASKS] A(Pass/Fail): 609/1439 | B: 423/1433 | C: 226/1822
[LOSS Ex1] A: 0.66428 | B: 0.66384 | C: 0.66316
[LOGITS Ex2 A] Mean Abs: 1.721 | Max: 7.003
[LOSS Ex2] A: 0.19068 | B: 0.38991 | C: 0.30749
** [JOINT LOSS] ** : 0.959787
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002281 | Grad Max: 0.052367
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139276 | Grad Max: 0.570654
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.007882
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010767 | Grad Max: 0.010767
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000995 | Grad Max: 0.127268
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018383 | Grad Max: 0.720915
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.006482
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008106 | Grad Max: 0.041901
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001789 | Grad Max: 0.004156
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000552 | Grad Max: 0.001368
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000886 | Grad Max: 0.002517
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015339 | Grad Max: 0.015339
[GRADIENT NORM TOTAL] 2.9198

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.448
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068984  0.49310163] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.532 | Std: 0.037
[MASKS] A(Pass/Fail): 591/1457 | B: 443/1605 | C: 249/1799
[LOSS Ex1] A: 0.66277 | B: 0.66312 | C: 0.66081
[LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.256
[LOSS Ex2] A: 0.19039 | B: 0.40620 | C: 0.33048
** [JOINT LOSS] ** : 0.971252
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.040721
  -> Layer: shared_layers.0.bias | Grad Mean: 0.079661 | Grad Max: 0.462967
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.008109
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008413 | Grad Max: 0.008413
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000570 | Grad Max: 0.118512
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009816 | Grad Max: 0.659728
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003641
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003189 | Grad Max: 0.018224
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000675 | Grad Max: 0.002448
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000087
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000204 | Grad Max: 0.000820
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000365 | Grad Max: 0.001246
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005690 | Grad Max: 0.005690
[GRADIENT NORM TOTAL] 1.8965

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.461
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50149864 0.4985014 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.532 | Std: 0.037
[MASKS] A(Pass/Fail): 599/1449 | B: 453/1595 | C: 172/1204
[LOSS Ex1] A: 0.66177 | B: 0.66366 | C: 0.65919
[LOGITS Ex2 A] Mean Abs: 1.695 | Max: 6.210
[LOSS Ex2] A: 0.20926 | B: 0.40087 | C: 0.29374
** [JOINT LOSS] ** : 0.962831
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004004 | Grad Max: 0.147046
  -> Layer: shared_layers.0.bias | Grad Mean: 0.064193 | Grad Max: 0.305326
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.007690
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000822 | Grad Max: 0.000822
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000609 | Grad Max: 0.112459
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009579 | Grad Max: 0.571090
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.002814
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002589 | Grad Max: 0.015524
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000228
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000617 | Grad Max: 0.002328
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000103
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000183 | Grad Max: 0.000695
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000279 | Grad Max: 0.001156
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004307 | Grad Max: 0.004307
[GRADIENT NORM TOTAL] 1.6749

[EPOCH SUMMARY] Train Loss: 0.9629

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9410 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9436 -> New: 0.9410)

############################## EPOCH 70/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.390
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043737  0.49562627] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.530 | Std: 0.037
[MASKS] A(Pass/Fail): 559/1489 | B: 464/1584 | C: 292/1756
[LOSS Ex1] A: 0.66689 | B: 0.66047 | C: 0.65637
[LOGITS Ex2 A] Mean Abs: 1.679 | Max: 6.629
[LOSS Ex2] A: 0.17926 | B: 0.38056 | C: 0.30113
** [JOINT LOSS] ** : 0.948226
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002918 | Grad Max: 0.078839
  -> Layer: shared_layers.0.bias | Grad Mean: 0.067960 | Grad Max: 0.351085
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.007111
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008004 | Grad Max: 0.008004
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000650 | Grad Max: 0.044341
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011135 | Grad Max: 0.227211
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000119 | Grad Max: 0.005519
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005338 | Grad Max: 0.027655
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000268
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001215 | Grad Max: 0.003433
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000145
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000362 | Grad Max: 0.001286
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000565 | Grad Max: 0.001844
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009011 | Grad Max: 0.009011
[GRADIENT NORM TOTAL] 1.4869

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.314
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52765477 0.47234526] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.530 | Std: 0.036
[MASKS] A(Pass/Fail): 565/1483 | B: 424/1432 | C: 272/1776
[LOSS Ex1] A: 0.66656 | B: 0.66357 | C: 0.65763
[LOGITS Ex2 A] Mean Abs: 1.690 | Max: 6.233
[LOSS Ex2] A: 0.19388 | B: 0.37986 | C: 0.29004
** [JOINT LOSS] ** : 0.950516
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003438 | Grad Max: 0.078932
  -> Layer: shared_layers.0.bias | Grad Mean: 0.170396 | Grad Max: 0.759365
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.007323
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009887 | Grad Max: 0.009887
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001137 | Grad Max: 0.164427
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020954 | Grad Max: 0.861242
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.010716
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009338 | Grad Max: 0.066786
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000397
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002015 | Grad Max: 0.004782
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000600 | Grad Max: 0.001476
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000833 | Grad Max: 0.002307
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014945 | Grad Max: 0.014945
[GRADIENT NORM TOTAL] 3.4790

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.452
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6273321 0.3726679] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.038
[MASKS] A(Pass/Fail): 628/1420 | B: 445/1603 | C: 261/1787
[LOSS Ex1] A: 0.66304 | B: 0.66284 | C: 0.65839
[LOGITS Ex2 A] Mean Abs: 1.729 | Max: 5.913
[LOSS Ex2] A: 0.18101 | B: 0.40280 | C: 0.31341
** [JOINT LOSS] ** : 0.960496
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002740 | Grad Max: 0.095527
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207622 | Grad Max: 1.017944
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.008349
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012926 | Grad Max: 0.012926
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.126448
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025657 | Grad Max: 0.703658
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000259 | Grad Max: 0.010623
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012085 | Grad Max: 0.073997
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000426
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002586 | Grad Max: 0.005886
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000768 | Grad Max: 0.001800
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001132 | Grad Max: 0.002497
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020261 | Grad Max: 0.020261
[GRADIENT NORM TOTAL] 4.1462

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.507
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000887 0.4999113] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.037
[MASKS] A(Pass/Fail): 606/1442 | B: 487/1561 | C: 268/1780
[LOSS Ex1] A: 0.66714 | B: 0.66338 | C: 0.65932
[LOGITS Ex2 A] Mean Abs: 1.697 | Max: 5.839
[LOSS Ex2] A: 0.17968 | B: 0.40114 | C: 0.33524
** [JOINT LOSS] ** : 0.968634
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007740 | Grad Max: 0.260775
  -> Layer: shared_layers.0.bias | Grad Mean: 0.242399 | Grad Max: 1.014084
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.006520
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001608 | Grad Max: 0.001608
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001703 | Grad Max: 0.251326
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030812 | Grad Max: 1.406180
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000311 | Grad Max: 0.007571
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014085 | Grad Max: 0.051802
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000569
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003228 | Grad Max: 0.007086
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000956 | Grad Max: 0.002310
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001388 | Grad Max: 0.002579
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024222 | Grad Max: 0.024222
[GRADIENT NORM TOTAL] 4.7317

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.308
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6060719  0.39392814] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.038
[MASKS] A(Pass/Fail): 588/1460 | B: 489/1559 | C: 252/1796
[LOSS Ex1] A: 0.66469 | B: 0.66019 | C: 0.65920
[LOGITS Ex2 A] Mean Abs: 1.686 | Max: 6.057
[LOSS Ex2] A: 0.19892 | B: 0.38493 | C: 0.32550
** [JOINT LOSS] ** : 0.964477
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005823 | Grad Max: 0.162444
  -> Layer: shared_layers.0.bias | Grad Mean: 0.238411 | Grad Max: 0.902657
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.007316
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001512 | Grad Max: 0.001512
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001579 | Grad Max: 0.253420
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029130 | Grad Max: 1.427959
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000289 | Grad Max: 0.009272
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013204 | Grad Max: 0.053885
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000493
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002959 | Grad Max: 0.006586
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000864 | Grad Max: 0.002160
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001343 | Grad Max: 0.002548
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022201 | Grad Max: 0.022201
[GRADIENT NORM TOTAL] 4.7101

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.385
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.56710505 0.43289497] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.038
[MASKS] A(Pass/Fail): 485/1131 | B: 453/1403 | C: 241/1807
[LOSS Ex1] A: 0.66351 | B: 0.66331 | C: 0.66127
[LOGITS Ex2 A] Mean Abs: 1.758 | Max: 6.033
[LOSS Ex2] A: 0.19131 | B: 0.38165 | C: 0.32537
** [JOINT LOSS] ** : 0.962138
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003139 | Grad Max: 0.067808
  -> Layer: shared_layers.0.bias | Grad Mean: 0.129168 | Grad Max: 0.645741
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.007744
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006565 | Grad Max: 0.006565
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001040 | Grad Max: 0.134604
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019066 | Grad Max: 0.752246
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.008024
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008707 | Grad Max: 0.050116
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000350
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001878 | Grad Max: 0.004642
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000550 | Grad Max: 0.001493
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000836 | Grad Max: 0.001996
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014405 | Grad Max: 0.014405
[GRADIENT NORM TOTAL] 2.7877

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.509
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061049 0.4938951] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.533 | Std: 0.038
[MASKS] A(Pass/Fail): 613/1435 | B: 472/1576 | C: 265/1783
[LOSS Ex1] A: 0.66369 | B: 0.66258 | C: 0.65881
[LOGITS Ex2 A] Mean Abs: 1.738 | Max: 6.548
[LOSS Ex2] A: 0.18417 | B: 0.40382 | C: 0.29913
** [JOINT LOSS] ** : 0.957400
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003155 | Grad Max: 0.055287
  -> Layer: shared_layers.0.bias | Grad Mean: 0.060343 | Grad Max: 0.306848
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.007925
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009516 | Grad Max: 0.009516
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000628 | Grad Max: 0.154057
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010366 | Grad Max: 0.868654
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000094 | Grad Max: 0.005125
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003927 | Grad Max: 0.023539
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000239
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000877 | Grad Max: 0.002625
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000270 | Grad Max: 0.000857
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000433 | Grad Max: 0.001613
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007591 | Grad Max: 0.007591
[GRADIENT NORM TOTAL] 1.7130

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.455
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50720555 0.4927944 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.532 | Std: 0.038
[MASKS] A(Pass/Fail): 595/1453 | B: 487/1561 | C: 287/1761
[LOSS Ex1] A: 0.66214 | B: 0.66313 | C: 0.65830
[LOGITS Ex2 A] Mean Abs: 1.692 | Max: 6.190
[LOSS Ex2] A: 0.18474 | B: 0.40069 | C: 0.33228
** [JOINT LOSS] ** : 0.967092
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002656 | Grad Max: 0.092177
  -> Layer: shared_layers.0.bias | Grad Mean: 0.251638 | Grad Max: 1.127915
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.007541
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003154 | Grad Max: 0.003154
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001608 | Grad Max: 0.162825
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030136 | Grad Max: 0.916949
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000315 | Grad Max: 0.010883
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014812 | Grad Max: 0.073077
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000494
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003214 | Grad Max: 0.007290
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000269
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000945 | Grad Max: 0.002446
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001374 | Grad Max: 0.002681
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024366 | Grad Max: 0.024366
[GRADIENT NORM TOTAL] 4.9181

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.469
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015681  0.49843186] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.532 | Std: 0.038
[MASKS] A(Pass/Fail): 602/1446 | B: 491/1557 | C: 290/1758
[LOSS Ex1] A: 0.66114 | B: 0.65993 | C: 0.65597
[LOGITS Ex2 A] Mean Abs: 1.694 | Max: 7.182
[LOSS Ex2] A: 0.20706 | B: 0.38104 | C: 0.29480
** [JOINT LOSS] ** : 0.953313
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003254 | Grad Max: 0.098724
  -> Layer: shared_layers.0.bias | Grad Mean: 0.200522 | Grad Max: 0.863219
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.007636
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001362 | Grad Max: 0.001362
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001310 | Grad Max: 0.154537
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023073 | Grad Max: 0.844493
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.007155
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010249 | Grad Max: 0.049124
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000407
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002101 | Grad Max: 0.004872
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000166
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000590 | Grad Max: 0.001674
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000750 | Grad Max: 0.001983
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013691 | Grad Max: 0.013691
[GRADIENT NORM TOTAL] 3.8680

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.396
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044213  0.49557874] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.531 | Std: 0.038
[MASKS] A(Pass/Fail): 562/1486 | B: 455/1401 | C: 249/1799
[LOSS Ex1] A: 0.66636 | B: 0.66305 | C: 0.66188
[LOGITS Ex2 A] Mean Abs: 1.704 | Max: 5.856
[LOSS Ex2] A: 0.18009 | B: 0.37376 | C: 0.32381
** [JOINT LOSS] ** : 0.956318
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004197 | Grad Max: 0.119586
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198567 | Grad Max: 0.881123
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001901 | Grad Max: 0.006310
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001646 | Grad Max: 0.001646
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001365 | Grad Max: 0.153054
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025096 | Grad Max: 0.843200
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.008104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011315 | Grad Max: 0.055706
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000419
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002492 | Grad Max: 0.006245
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000184
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000726 | Grad Max: 0.001838
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000949 | Grad Max: 0.002402
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017794 | Grad Max: 0.017794
[GRADIENT NORM TOTAL] 4.0633

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.321
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5282064  0.47179353] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.037
[MASKS] A(Pass/Fail): 567/1481 | B: 473/1575 | C: 270/1778
[LOSS Ex1] A: 0.66606 | B: 0.66233 | C: 0.65788
[LOGITS Ex2 A] Mean Abs: 1.673 | Max: 6.053
[LOSS Ex2] A: 0.19550 | B: 0.40913 | C: 0.30047
** [JOINT LOSS] ** : 0.963790
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003996 | Grad Max: 0.096086
  -> Layer: shared_layers.0.bias | Grad Mean: 0.236810 | Grad Max: 1.107232
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.007537
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004783 | Grad Max: 0.004783
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001520 | Grad Max: 0.161993
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028304 | Grad Max: 0.908366
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.009110
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014084 | Grad Max: 0.063130
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000540
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003067 | Grad Max: 0.007242
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000228
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000895 | Grad Max: 0.002331
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001205 | Grad Max: 0.002569
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022322 | Grad Max: 0.022322
[GRADIENT NORM TOTAL] 4.5401

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.457
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63011426 0.36988568] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.039
[MASKS] A(Pass/Fail): 635/1413 | B: 489/1559 | C: 273/1775
[LOSS Ex1] A: 0.66249 | B: 0.66289 | C: 0.65743
[LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.326
[LOSS Ex2] A: 0.18284 | B: 0.39514 | C: 0.30452
** [JOINT LOSS] ** : 0.955105
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002837 | Grad Max: 0.057319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126809 | Grad Max: 0.539896
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.007788
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008628 | Grad Max: 0.008628
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000938 | Grad Max: 0.146875
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017268 | Grad Max: 0.819746
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.005798
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007896 | Grad Max: 0.036553
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000300
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001712 | Grad Max: 0.004438
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000153
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000488 | Grad Max: 0.001378
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.001833
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012042 | Grad Max: 0.012042
[GRADIENT NORM TOTAL] 2.6572

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.512
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000924  0.49990755] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.038
[MASKS] A(Pass/Fail): 609/1439 | B: 493/1555 | C: 249/1799
[LOSS Ex1] A: 0.66668 | B: 0.65970 | C: 0.65968
[LOGITS Ex2 A] Mean Abs: 1.709 | Max: 6.132
[LOSS Ex2] A: 0.17856 | B: 0.38129 | C: 0.30454
** [JOINT LOSS] ** : 0.950150
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005037 | Grad Max: 0.159361
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087559 | Grad Max: 0.416031
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.006747
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004422 | Grad Max: 0.004422
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000871 | Grad Max: 0.149495
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014876 | Grad Max: 0.808670
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.003786
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005471 | Grad Max: 0.024383
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000255
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001317 | Grad Max: 0.003639
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000127
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000388 | Grad Max: 0.001144
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000613 | Grad Max: 0.001867
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009510 | Grad Max: 0.009510
[GRADIENT NORM TOTAL] 2.2826

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.314
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.60823715 0.39176285] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.039
[MASKS] A(Pass/Fail): 594/1454 | B: 455/1401 | C: 178/1198
[LOSS Ex1] A: 0.66420 | B: 0.66282 | C: 0.65918
[LOGITS Ex2 A] Mean Abs: 1.731 | Max: 5.938
[LOSS Ex2] A: 0.19781 | B: 0.37848 | C: 0.31893
** [JOINT LOSS] ** : 0.960474
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003021 | Grad Max: 0.069912
  -> Layer: shared_layers.0.bias | Grad Mean: 0.192755 | Grad Max: 0.741550
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.007232
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002070 | Grad Max: 0.002070
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001299 | Grad Max: 0.188306
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023559 | Grad Max: 1.060855
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.008622
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010940 | Grad Max: 0.055225
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000504
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002326 | Grad Max: 0.005682
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000202
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000677 | Grad Max: 0.001841
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001003 | Grad Max: 0.002627
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017731 | Grad Max: 0.017731
[GRADIENT NORM TOTAL] 4.0067

[EPOCH SUMMARY] Train Loss: 0.9584

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9376 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9410 -> New: 0.9376)

############################## EPOCH 71/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.391
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5684959  0.43150407] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.039
[MASKS] A(Pass/Fail): 490/1126 | B: 474/1574 | C: 272/1776
[LOSS Ex1] A: 0.66302 | B: 0.66211 | C: 0.65677
[LOGITS Ex2 A] Mean Abs: 1.758 | Max: 5.598
[LOSS Ex2] A: 0.17072 | B: 0.40672 | C: 0.31028
** [JOINT LOSS] ** : 0.956538
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002948 | Grad Max: 0.077437
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126719 | Grad Max: 0.547189
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.007205
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007358 | Grad Max: 0.007358
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000819 | Grad Max: 0.103437
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014646 | Grad Max: 0.543806
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.006398
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005729 | Grad Max: 0.032252
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000269
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001228 | Grad Max: 0.003534
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000347 | Grad Max: 0.001048
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.001343
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007321 | Grad Max: 0.007321
[GRADIENT NORM TOTAL] 2.4164

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.514
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50609773 0.4939023 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.533 | Std: 0.039
[MASKS] A(Pass/Fail): 618/1430 | B: 490/1558 | C: 278/1770
[LOSS Ex1] A: 0.66321 | B: 0.66269 | C: 0.65839
[LOGITS Ex2 A] Mean Abs: 1.721 | Max: 7.062
[LOSS Ex2] A: 0.18244 | B: 0.38814 | C: 0.30608
** [JOINT LOSS] ** : 0.953650
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002368 | Grad Max: 0.080588
  -> Layer: shared_layers.0.bias | Grad Mean: 0.193622 | Grad Max: 0.959441
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.006796
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000019 | Grad Max: 0.000019
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001298 | Grad Max: 0.180900
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024244 | Grad Max: 1.029200
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000258 | Grad Max: 0.008419
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012174 | Grad Max: 0.055366
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000456
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002642 | Grad Max: 0.006510
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000197
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000764 | Grad Max: 0.001812
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001081 | Grad Max: 0.002270
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019363 | Grad Max: 0.019363
[GRADIENT NORM TOTAL] 3.9649

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.461
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50750947 0.49249056] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.533 | Std: 0.039
[MASKS] A(Pass/Fail): 602/1446 | B: 494/1554 | C: 255/1793
[LOSS Ex1] A: 0.66162 | B: 0.65948 | C: 0.65971
[LOGITS Ex2 A] Mean Abs: 1.724 | Max: 5.995
[LOSS Ex2] A: 0.18287 | B: 0.38848 | C: 0.29001
** [JOINT LOSS] ** : 0.947391
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002950 | Grad Max: 0.073082
  -> Layer: shared_layers.0.bias | Grad Mean: 0.124668 | Grad Max: 0.473546
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.008046
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009028 | Grad Max: 0.009028
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000913 | Grad Max: 0.148651
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015776 | Grad Max: 0.821628
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.006875
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006515 | Grad Max: 0.035226
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000299
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001315 | Grad Max: 0.003679
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000128
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001186
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001518
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009039 | Grad Max: 0.009039
[GRADIENT NORM TOTAL] 2.6455

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.475
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50156957 0.49843037] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.533 | Std: 0.039
[MASKS] A(Pass/Fail): 609/1439 | B: 460/1396 | C: 282/1766
[LOSS Ex1] A: 0.66062 | B: 0.66261 | C: 0.65618
[LOGITS Ex2 A] Mean Abs: 1.746 | Max: 5.998
[LOSS Ex2] A: 0.20316 | B: 0.38150 | C: 0.33110
** [JOINT LOSS] ** : 0.965058
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005331 | Grad Max: 0.182884
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203688 | Grad Max: 0.926736
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.008616
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009103 | Grad Max: 0.009103
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001572 | Grad Max: 0.193502
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028530 | Grad Max: 1.077740
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.008354
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012755 | Grad Max: 0.058944
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000516
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002854 | Grad Max: 0.006953
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000840 | Grad Max: 0.002071
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001224 | Grad Max: 0.002348
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020865 | Grad Max: 0.020865
[GRADIENT NORM TOTAL] 4.3424

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.401
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043908  0.49560922] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.038
[MASKS] A(Pass/Fail): 568/1480 | B: 477/1571 | C: 268/1780
[LOSS Ex1] A: 0.66594 | B: 0.66190 | C: 0.65758
[LOGITS Ex2 A] Mean Abs: 1.723 | Max: 6.039
[LOSS Ex2] A: 0.17875 | B: 0.40477 | C: 0.31786
** [JOINT LOSS] ** : 0.962270
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004370 | Grad Max: 0.111008
  -> Layer: shared_layers.0.bias | Grad Mean: 0.216595 | Grad Max: 1.080723
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.006217
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003124 | Grad Max: 0.003124
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001468 | Grad Max: 0.156772
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027254 | Grad Max: 0.878531
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000274 | Grad Max: 0.008442
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012685 | Grad Max: 0.062099
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000454
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002757 | Grad Max: 0.006373
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000203
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000802 | Grad Max: 0.002019
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001078 | Grad Max: 0.002463
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019793 | Grad Max: 0.019793
[GRADIENT NORM TOTAL] 4.2216

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.326
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5287489 0.4712511] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.531 | Std: 0.037
[MASKS] A(Pass/Fail): 570/1478 | B: 492/1556 | C: 274/1774
[LOSS Ex1] A: 0.66565 | B: 0.66249 | C: 0.65802
[LOGITS Ex2 A] Mean Abs: 1.671 | Max: 6.066
[LOSS Ex2] A: 0.19307 | B: 0.40170 | C: 0.29462
** [JOINT LOSS] ** : 0.958515
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004848 | Grad Max: 0.134589
  -> Layer: shared_layers.0.bias | Grad Mean: 0.232382 | Grad Max: 0.852169
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001993 | Grad Max: 0.006965
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000050 | Grad Max: 0.000050
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001499 | Grad Max: 0.200420
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027839 | Grad Max: 1.124529
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000287 | Grad Max: 0.009481
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013288 | Grad Max: 0.061274
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000511
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002926 | Grad Max: 0.006852
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000213
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000852 | Grad Max: 0.002024
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001317 | Grad Max: 0.002523
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021671 | Grad Max: 0.021671
[GRADIENT NORM TOTAL] 4.4608

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.463
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63251436 0.36748567] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.039
[MASKS] A(Pass/Fail): 640/1408 | B: 497/1551 | C: 256/1792
[LOSS Ex1] A: 0.66202 | B: 0.65927 | C: 0.65986
[LOGITS Ex2 A] Mean Abs: 1.722 | Max: 6.390
[LOSS Ex2] A: 0.17889 | B: 0.37970 | C: 0.32848
** [JOINT LOSS] ** : 0.956074
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004805 | Grad Max: 0.145682
  -> Layer: shared_layers.0.bias | Grad Mean: 0.205079 | Grad Max: 0.761327
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.007662
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005794 | Grad Max: 0.005794
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001358 | Grad Max: 0.169091
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025037 | Grad Max: 0.930979
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000254 | Grad Max: 0.007886
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011757 | Grad Max: 0.060057
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000406
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002630 | Grad Max: 0.006283
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000774 | Grad Max: 0.001899
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001103 | Grad Max: 0.002144
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019510 | Grad Max: 0.019510
[GRADIENT NORM TOTAL] 3.8819

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.519
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000806 0.4999194] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.039
[MASKS] A(Pass/Fail): 614/1434 | B: 462/1394 | C: 275/1773
[LOSS Ex1] A: 0.66628 | B: 0.66243 | C: 0.65572
[LOGITS Ex2 A] Mean Abs: 1.746 | Max: 5.956
[LOSS Ex2] A: 0.18064 | B: 0.37283 | C: 0.31274
** [JOINT LOSS] ** : 0.950210
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004300 | Grad Max: 0.146307
  -> Layer: shared_layers.0.bias | Grad Mean: 0.155161 | Grad Max: 0.647833
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.006634
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001715 | Grad Max: 0.001715
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.123889
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020247 | Grad Max: 0.672061
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.008856
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008321 | Grad Max: 0.051305
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000330
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001673 | Grad Max: 0.004314
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000148
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000478 | Grad Max: 0.001268
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000644 | Grad Max: 0.002018
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012354 | Grad Max: 0.012354
[GRADIENT NORM TOTAL] 3.2755

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.320
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61003166 0.3899683 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.039
[MASKS] A(Pass/Fail): 597/1451 | B: 481/1567 | C: 258/1790
[LOSS Ex1] A: 0.66378 | B: 0.66172 | C: 0.65805
[LOGITS Ex2 A] Mean Abs: 1.745 | Max: 6.234
[LOSS Ex2] A: 0.19573 | B: 0.40121 | C: 0.28849
** [JOINT LOSS] ** : 0.956329
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002592 | Grad Max: 0.077092
  -> Layer: shared_layers.0.bias | Grad Mean: 0.205274 | Grad Max: 0.934262
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.007365
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006182 | Grad Max: 0.006182
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001358 | Grad Max: 0.145331
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025044 | Grad Max: 0.806665
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000258 | Grad Max: 0.008441
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012096 | Grad Max: 0.052618
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000481
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002574 | Grad Max: 0.006289
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000187
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000752 | Grad Max: 0.001889
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001110 | Grad Max: 0.002600
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019817 | Grad Max: 0.019817
[GRADIENT NORM TOTAL] 4.1253

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.396
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5696789 0.4303211] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.039
[MASKS] A(Pass/Fail): 494/1122 | B: 495/1553 | C: 265/1783
[LOSS Ex1] A: 0.66260 | B: 0.66231 | C: 0.65860
[LOGITS Ex2 A] Mean Abs: 1.755 | Max: 6.541
[LOSS Ex2] A: 0.18216 | B: 0.40031 | C: 0.32481
** [JOINT LOSS] ** : 0.963599
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002757 | Grad Max: 0.078040
  -> Layer: shared_layers.0.bias | Grad Mean: 0.122765 | Grad Max: 0.537053
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.007122
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000618 | Grad Max: 0.000618
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000808 | Grad Max: 0.139410
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014633 | Grad Max: 0.761982
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.005446
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006624 | Grad Max: 0.038472
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000258
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001446 | Grad Max: 0.003615
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000434 | Grad Max: 0.001233
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000653 | Grad Max: 0.001350
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012073 | Grad Max: 0.012073
[GRADIENT NORM TOTAL] 2.4836

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.521
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061299  0.49387008] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.040
[MASKS] A(Pass/Fail): 625/1423 | B: 498/1550 | C: 291/1757
[LOSS Ex1] A: 0.66279 | B: 0.65907 | C: 0.65665
[LOGITS Ex2 A] Mean Abs: 1.725 | Max: 5.367
[LOSS Ex2] A: 0.18690 | B: 0.38356 | C: 0.30666
** [JOINT LOSS] ** : 0.951875
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002328 | Grad Max: 0.053462
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108052 | Grad Max: 0.452071
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.006830
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000135 | Grad Max: 0.000135
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000764 | Grad Max: 0.293678
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013540 | Grad Max: 1.652915
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.006123
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005463 | Grad Max: 0.039911
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000300
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001136 | Grad Max: 0.003761
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000119
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000322 | Grad Max: 0.001177
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000437 | Grad Max: 0.001468
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007881 | Grad Max: 0.007881
[GRADIENT NORM TOTAL] 2.8317

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.467
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50776315 0.4922368 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.040
[MASKS] A(Pass/Fail): 605/1443 | B: 463/1393 | C: 267/1781
[LOSS Ex1] A: 0.66118 | B: 0.66224 | C: 0.65797
[LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.782
[LOSS Ex2] A: 0.17910 | B: 0.37239 | C: 0.29579
** [JOINT LOSS] ** : 0.942888
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003565 | Grad Max: 0.102970
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134763 | Grad Max: 0.486020
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.008179
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010279 | Grad Max: 0.010279
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001006 | Grad Max: 0.142499
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018174 | Grad Max: 0.736281
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.005273
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007509 | Grad Max: 0.034743
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000364
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001685 | Grad Max: 0.004300
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000506 | Grad Max: 0.001322
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000819 | Grad Max: 0.002430
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014140 | Grad Max: 0.014140
[GRADIENT NORM TOTAL] 2.9474

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.482
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50157166 0.49842837] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.039
[MASKS] A(Pass/Fail): 614/1434 | B: 484/1564 | C: 256/1792
[LOSS Ex1] A: 0.66018 | B: 0.66153 | C: 0.65751
[LOGITS Ex2 A] Mean Abs: 1.719 | Max: 6.788
[LOSS Ex2] A: 0.19779 | B: 0.40387 | C: 0.30647
** [JOINT LOSS] ** : 0.962449
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002823 | Grad Max: 0.120234
  -> Layer: shared_layers.0.bias | Grad Mean: 0.068272 | Grad Max: 0.278382
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.008125
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004683 | Grad Max: 0.004683
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000638 | Grad Max: 0.186946
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010548 | Grad Max: 1.043570
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003982
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003157 | Grad Max: 0.019583
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000248
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000775 | Grad Max: 0.002606
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000237 | Grad Max: 0.000703
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001267
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006214 | Grad Max: 0.006214
[GRADIENT NORM TOTAL] 1.9948

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.405
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50437236 0.4956276 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.532 | Std: 0.039
[MASKS] A(Pass/Fail): 576/1472 | B: 498/1550 | C: 186/1190
[LOSS Ex1] A: 0.66558 | B: 0.66212 | C: 0.65731
[LOGITS Ex2 A] Mean Abs: 1.682 | Max: 5.706
[LOSS Ex2] A: 0.17459 | B: 0.40493 | C: 0.32830
** [JOINT LOSS] ** : 0.964279
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004864 | Grad Max: 0.141049
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215183 | Grad Max: 0.882134
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.006086
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002387 | Grad Max: 0.002387
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001568 | Grad Max: 0.232661
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029392 | Grad Max: 1.294034
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000289 | Grad Max: 0.008021
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013425 | Grad Max: 0.054880
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000499
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002937 | Grad Max: 0.006658
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000233
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000857 | Grad Max: 0.002196
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001273 | Grad Max: 0.002454
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021996 | Grad Max: 0.021996
[GRADIENT NORM TOTAL] 4.5733

[EPOCH SUMMARY] Train Loss: 0.9565

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9360 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9376 -> New: 0.9360)

############################## EPOCH 72/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.331
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5292787  0.47072127] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.532 | Std: 0.038
[MASKS] A(Pass/Fail): 582/1466 | B: 499/1549 | C: 298/1750
[LOSS Ex1] A: 0.66529 | B: 0.65887 | C: 0.65679
[LOGITS Ex2 A] Mean Abs: 1.673 | Max: 6.934
[LOSS Ex2] A: 0.18471 | B: 0.38429 | C: 0.31148
** [JOINT LOSS] ** : 0.953812
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003323 | Grad Max: 0.108317
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108771 | Grad Max: 0.374110
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.006779
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004990 | Grad Max: 0.004990
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000843 | Grad Max: 0.165471
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015168 | Grad Max: 0.928244
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.004662
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005712 | Grad Max: 0.027641
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000257
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001287 | Grad Max: 0.003712
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000377 | Grad Max: 0.001033
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000591 | Grad Max: 0.001613
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009406 | Grad Max: 0.009406
[GRADIENT NORM TOTAL] 2.5910

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.468
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6347861  0.36521387] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.040
[MASKS] A(Pass/Fail): 652/1396 | B: 465/1391 | C: 283/1765
[LOSS Ex1] A: 0.66160 | B: 0.66204 | C: 0.65597
[LOGITS Ex2 A] Mean Abs: 1.739 | Max: 6.632
[LOSS Ex2] A: 0.17872 | B: 0.37605 | C: 0.31157
** [JOINT LOSS] ** : 0.948652
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003652 | Grad Max: 0.101862
  -> Layer: shared_layers.0.bias | Grad Mean: 0.281203 | Grad Max: 1.253438
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.008291
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012997 | Grad Max: 0.012997
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001759 | Grad Max: 0.224583
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032815 | Grad Max: 1.241070
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.010870
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016692 | Grad Max: 0.079055
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000562
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003638 | Grad Max: 0.008055
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000232
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001060 | Grad Max: 0.002418
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001443 | Grad Max: 0.003467
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026605 | Grad Max: 0.026605
[GRADIENT NORM TOTAL] 5.4370

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.524
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50006914 0.49993086] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.039
[MASKS] A(Pass/Fail): 623/1425 | B: 487/1561 | C: 251/1797
[LOSS Ex1] A: 0.66593 | B: 0.66133 | C: 0.65937
[LOGITS Ex2 A] Mean Abs: 1.743 | Max: 5.347
[LOSS Ex2] A: 0.18063 | B: 0.40770 | C: 0.32093
** [JOINT LOSS] ** : 0.965297
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003117 | Grad Max: 0.109055
  -> Layer: shared_layers.0.bias | Grad Mean: 0.298000 | Grad Max: 1.278504
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.006895
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006000 | Grad Max: 0.006000
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001894 | Grad Max: 0.232702
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035168 | Grad Max: 1.305956
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000362 | Grad Max: 0.011738
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017238 | Grad Max: 0.086764
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000579
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003623 | Grad Max: 0.008446
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000260
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001045 | Grad Max: 0.002575
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001397 | Grad Max: 0.002905
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026243 | Grad Max: 0.026243
[GRADIENT NORM TOTAL] 5.9075

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.325
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6118684  0.38813165] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.040
[MASKS] A(Pass/Fail): 604/1444 | B: 502/1546 | C: 286/1762
[LOSS Ex1] A: 0.66340 | B: 0.66193 | C: 0.65629
[LOGITS Ex2 A] Mean Abs: 1.706 | Max: 6.380
[LOSS Ex2] A: 0.19727 | B: 0.40313 | C: 0.30234
** [JOINT LOSS] ** : 0.961454
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003115 | Grad Max: 0.081051
  -> Layer: shared_layers.0.bias | Grad Mean: 0.110506 | Grad Max: 0.522000
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.006522
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003347 | Grad Max: 0.003347
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000808 | Grad Max: 0.103885
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014349 | Grad Max: 0.533061
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000153 | Grad Max: 0.005025
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007008 | Grad Max: 0.028537
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000262
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001572 | Grad Max: 0.003862
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000132
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000476 | Grad Max: 0.001305
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000796 | Grad Max: 0.001794
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012953 | Grad Max: 0.012953
[GRADIENT NORM TOTAL] 2.1273

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.401
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5709001  0.42909992] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.040
[MASKS] A(Pass/Fail): 499/1117 | B: 500/1548 | C: 261/1787
[LOSS Ex1] A: 0.66220 | B: 0.65866 | C: 0.65906
[LOGITS Ex2 A] Mean Abs: 1.751 | Max: 6.624
[LOSS Ex2] A: 0.18149 | B: 0.38279 | C: 0.32900
** [JOINT LOSS] ** : 0.957736
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001665 | Grad Max: 0.029053
  -> Layer: shared_layers.0.bias | Grad Mean: 0.060860 | Grad Max: 0.246321
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.006807
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004908 | Grad Max: 0.004908
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000474 | Grad Max: 0.082093
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008093 | Grad Max: 0.460030
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.004107
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003131 | Grad Max: 0.019694
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000665 | Grad Max: 0.002533
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000090
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000198 | Grad Max: 0.000746
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000273 | Grad Max: 0.001068
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004834 | Grad Max: 0.004834
[GRADIENT NORM TOTAL] 1.3217

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.526
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061419 0.4938581] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.040
[MASKS] A(Pass/Fail): 631/1417 | B: 468/1388 | C: 263/1785
[LOSS Ex1] A: 0.66239 | B: 0.66184 | C: 0.65779
[LOGITS Ex2 A] Mean Abs: 1.742 | Max: 6.678
[LOSS Ex2] A: 0.19240 | B: 0.37873 | C: 0.29137
** [JOINT LOSS] ** : 0.948169
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003673 | Grad Max: 0.083851
  -> Layer: shared_layers.0.bias | Grad Mean: 0.225707 | Grad Max: 0.945202
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.007181
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002302 | Grad Max: 0.002302
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001537 | Grad Max: 0.179279
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028803 | Grad Max: 0.991492
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.009628
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013077 | Grad Max: 0.066581
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000426
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002866 | Grad Max: 0.006463
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000217
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000857 | Grad Max: 0.002125
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001287 | Grad Max: 0.003387
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023201 | Grad Max: 0.023201
[GRADIENT NORM TOTAL] 4.5597

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.473
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079855  0.49201453] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.040
[MASKS] A(Pass/Fail): 607/1441 | B: 488/1560 | C: 250/1798
[LOSS Ex1] A: 0.66073 | B: 0.66113 | C: 0.65784
[LOGITS Ex2 A] Mean Abs: 1.736 | Max: 5.332
[LOSS Ex2] A: 0.18876 | B: 0.40695 | C: 0.27634
** [JOINT LOSS] ** : 0.950583
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004767 | Grad Max: 0.116188
  -> Layer: shared_layers.0.bias | Grad Mean: 0.246410 | Grad Max: 1.028237
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002258 | Grad Max: 0.008035
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011713 | Grad Max: 0.011713
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001679 | Grad Max: 0.156089
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031325 | Grad Max: 0.861045
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000311 | Grad Max: 0.010065
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014545 | Grad Max: 0.070290
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000554
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003175 | Grad Max: 0.007585
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000226
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000925 | Grad Max: 0.002259
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001254 | Grad Max: 0.002967
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023159 | Grad Max: 0.023159
[GRADIENT NORM TOTAL] 4.8239

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.488
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.501587   0.49841306] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.040
[MASKS] A(Pass/Fail): 617/1431 | B: 504/1544 | C: 260/1788
[LOSS Ex1] A: 0.65972 | B: 0.66173 | C: 0.65605
[LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.129
[LOSS Ex2] A: 0.19386 | B: 0.40194 | C: 0.30604
** [JOINT LOSS] ** : 0.959780
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003124 | Grad Max: 0.062710
  -> Layer: shared_layers.0.bias | Grad Mean: 0.195525 | Grad Max: 0.841520
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.007510
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001212 | Grad Max: 0.001212
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001248 | Grad Max: 0.176952
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022521 | Grad Max: 0.978417
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008388
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010487 | Grad Max: 0.058088
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000381
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002203 | Grad Max: 0.004873
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000618 | Grad Max: 0.001456
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000842 | Grad Max: 0.001970
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015263 | Grad Max: 0.015263
[GRADIENT NORM TOTAL] 3.8145

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.411
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043904 0.4956096] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.040
[MASKS] A(Pass/Fail): 578/1470 | B: 503/1545 | C: 268/1780
[LOSS Ex1] A: 0.66521 | B: 0.65845 | C: 0.65901
[LOGITS Ex2 A] Mean Abs: 1.668 | Max: 5.972
[LOSS Ex2] A: 0.17859 | B: 0.38637 | C: 0.30273
** [JOINT LOSS] ** : 0.950118
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002836 | Grad Max: 0.061505
  -> Layer: shared_layers.0.bias | Grad Mean: 0.171795 | Grad Max: 0.787494
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.006023
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004224 | Grad Max: 0.004224
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001239 | Grad Max: 0.146346
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022735 | Grad Max: 0.832709
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.008902
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011068 | Grad Max: 0.052146
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000413
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002394 | Grad Max: 0.005469
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000174
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000687 | Grad Max: 0.001753
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000995 | Grad Max: 0.002534
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017027 | Grad Max: 0.017027
[GRADIENT NORM TOTAL] 3.5688

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.337
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.52967405 0.47032598] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.532 | Std: 0.039
[MASKS] A(Pass/Fail): 583/1465 | B: 469/1387 | C: 284/1764
[LOSS Ex1] A: 0.66491 | B: 0.66164 | C: 0.65449
[LOGITS Ex2 A] Mean Abs: 1.671 | Max: 6.610
[LOSS Ex2] A: 0.19264 | B: 0.37756 | C: 0.32947
** [JOINT LOSS] ** : 0.960237
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003998 | Grad Max: 0.116749
  -> Layer: shared_layers.0.bias | Grad Mean: 0.177324 | Grad Max: 0.797929
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.006724
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004980 | Grad Max: 0.004980
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001232 | Grad Max: 0.144904
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022624 | Grad Max: 0.751381
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.008203
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010423 | Grad Max: 0.057257
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000389
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002298 | Grad Max: 0.005415
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000175
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000664 | Grad Max: 0.001589
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000836 | Grad Max: 0.001968
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015523 | Grad Max: 0.015523
[GRADIENT NORM TOTAL] 3.5850

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.473
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6369923  0.36300772] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 654/1394 | B: 489/1559 | C: 288/1760
[LOSS Ex1] A: 0.66117 | B: 0.66094 | C: 0.65465
[LOGITS Ex2 A] Mean Abs: 1.720 | Max: 6.115
[LOSS Ex2] A: 0.16711 | B: 0.39581 | C: 0.29338
** [JOINT LOSS] ** : 0.944351
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.041499
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084743 | Grad Max: 0.363287
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.007833
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007818 | Grad Max: 0.007818
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000718 | Grad Max: 0.110685
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012701 | Grad Max: 0.627129
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.004844
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005477 | Grad Max: 0.027689
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000254
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001208 | Grad Max: 0.003592
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000112
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000361 | Grad Max: 0.000988
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000507 | Grad Max: 0.001892
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009535 | Grad Max: 0.009535
[GRADIENT NORM TOTAL] 2.1304

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.530
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000474  0.49995264] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.040
[MASKS] A(Pass/Fail): 629/1419 | B: 506/1542 | C: 268/1780
[LOSS Ex1] A: 0.66555 | B: 0.66154 | C: 0.65644
[LOGITS Ex2 A] Mean Abs: 1.704 | Max: 5.743
[LOSS Ex2] A: 0.17624 | B: 0.40173 | C: 0.29929
** [JOINT LOSS] ** : 0.953598
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005826 | Grad Max: 0.190989
  -> Layer: shared_layers.0.bias | Grad Mean: 0.252968 | Grad Max: 1.026355
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002031 | Grad Max: 0.006574
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003423 | Grad Max: 0.003423
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001740 | Grad Max: 0.197425
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031585 | Grad Max: 1.067955
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000316 | Grad Max: 0.009404
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014770 | Grad Max: 0.063298
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000507
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003337 | Grad Max: 0.007129
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000971 | Grad Max: 0.002408
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001356 | Grad Max: 0.002723
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023758 | Grad Max: 0.023758
[GRADIENT NORM TOTAL] 4.8743

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.330
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6135236  0.38647643] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 608/1440 | B: 504/1544 | C: 268/1780
[LOSS Ex1] A: 0.66298 | B: 0.65825 | C: 0.65690
[LOGITS Ex2 A] Mean Abs: 1.716 | Max: 5.878
[LOSS Ex2] A: 0.18972 | B: 0.38331 | C: 0.31088
** [JOINT LOSS] ** : 0.954010
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003780 | Grad Max: 0.092864
  -> Layer: shared_layers.0.bias | Grad Mean: 0.170660 | Grad Max: 0.758402
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.007554
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004585 | Grad Max: 0.004585
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001080 | Grad Max: 0.121937
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019732 | Grad Max: 0.619565
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.007754
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009200 | Grad Max: 0.051600
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000366
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002044 | Grad Max: 0.004993
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000163
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.001631
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000708 | Grad Max: 0.001723
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013410 | Grad Max: 0.013410
[GRADIENT NORM TOTAL] 3.1471

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.406
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5719     0.42809996] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 502/1114 | B: 471/1385 | C: 174/1202
[LOSS Ex1] A: 0.66177 | B: 0.66144 | C: 0.65587
[LOGITS Ex2 A] Mean Abs: 1.779 | Max: 5.948
[LOSS Ex2] A: 0.17850 | B: 0.36575 | C: 0.29837
** [JOINT LOSS] ** : 0.940571
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.086433
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198509 | Grad Max: 0.977351
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.006954
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001283 | Grad Max: 0.001283
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001323 | Grad Max: 0.147752
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024396 | Grad Max: 0.827373
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.008620
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011177 | Grad Max: 0.049957
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000384
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002407 | Grad Max: 0.005618
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000176
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000705 | Grad Max: 0.001637
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000912 | Grad Max: 0.002461
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017572 | Grad Max: 0.017572
[GRADIENT NORM TOTAL] 4.1565

[EPOCH SUMMARY] Train Loss: 0.9535

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9353 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9360 -> New: 0.9353)

############################## EPOCH 73/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.532
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062554 0.4937446] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 635/1413 | B: 490/1558 | C: 263/1785
[LOSS Ex1] A: 0.66198 | B: 0.66074 | C: 0.65550
[LOGITS Ex2 A] Mean Abs: 1.764 | Max: 8.323
[LOSS Ex2] A: 0.18478 | B: 0.40067 | C: 0.29184
** [JOINT LOSS] ** : 0.951840
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002935 | Grad Max: 0.074177
  -> Layer: shared_layers.0.bias | Grad Mean: 0.225713 | Grad Max: 0.982091
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.007444
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005924 | Grad Max: 0.005924
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001538 | Grad Max: 0.151925
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028775 | Grad Max: 0.851188
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.009525
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013797 | Grad Max: 0.068633
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000461
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003008 | Grad Max: 0.006850
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000215
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000878 | Grad Max: 0.002101
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001182 | Grad Max: 0.002772
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022154 | Grad Max: 0.022154
[GRADIENT NORM TOTAL] 4.6186

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.480
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50807905 0.49192095] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 609/1439 | B: 510/1538 | C: 250/1798
[LOSS Ex1] A: 0.66031 | B: 0.66134 | C: 0.65789
[LOGITS Ex2 A] Mean Abs: 1.728 | Max: 5.516
[LOSS Ex2] A: 0.18242 | B: 0.39621 | C: 0.32467
** [JOINT LOSS] ** : 0.960945
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001850 | Grad Max: 0.045483
  -> Layer: shared_layers.0.bias | Grad Mean: 0.040156 | Grad Max: 0.209246
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002325 | Grad Max: 0.008380
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015103 | Grad Max: 0.015103
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000437 | Grad Max: 0.119613
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007142 | Grad Max: 0.662676
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.003517
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001496 | Grad Max: 0.013644
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000134
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000241 | Grad Max: 0.001505
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000058
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000073 | Grad Max: 0.000471
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000188 | Grad Max: 0.000777
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001042 | Grad Max: 0.001042
[GRADIENT NORM TOTAL] 1.3585

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.495
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016087 0.4983913] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 622/1426 | B: 504/1544 | C: 254/1794
[LOSS Ex1] A: 0.65931 | B: 0.65804 | C: 0.65724
[LOGITS Ex2 A] Mean Abs: 1.690 | Max: 6.732
[LOSS Ex2] A: 0.20295 | B: 0.39099 | C: 0.29950
** [JOINT LOSS] ** : 0.956007
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003806 | Grad Max: 0.100134
  -> Layer: shared_layers.0.bias | Grad Mean: 0.258279 | Grad Max: 1.224548
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002313 | Grad Max: 0.008319
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009592 | Grad Max: 0.009592
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001689 | Grad Max: 0.346910
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031510 | Grad Max: 1.956380
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.008787
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014208 | Grad Max: 0.063214
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000463
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003000 | Grad Max: 0.006857
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000209
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000849 | Grad Max: 0.002216
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001141 | Grad Max: 0.002282
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020629 | Grad Max: 0.020629
[GRADIENT NORM TOTAL] 5.4977

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.416
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.504375   0.49562502] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.533 | Std: 0.040
[MASKS] A(Pass/Fail): 584/1464 | B: 473/1383 | C: 283/1765
[LOSS Ex1] A: 0.66486 | B: 0.66123 | C: 0.65559
[LOGITS Ex2 A] Mean Abs: 1.679 | Max: 5.880
[LOSS Ex2] A: 0.17868 | B: 0.37508 | C: 0.27429
** [JOINT LOSS] ** : 0.936578
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003993 | Grad Max: 0.098105
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207280 | Grad Max: 1.010794
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002004 | Grad Max: 0.006063
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002022 | Grad Max: 0.002022
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001468 | Grad Max: 0.246577
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027207 | Grad Max: 1.387417
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.009316
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012722 | Grad Max: 0.057906
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000466
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002769 | Grad Max: 0.006416
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000194
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000777 | Grad Max: 0.001956
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000994 | Grad Max: 0.002591
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017549 | Grad Max: 0.017549
[GRADIENT NORM TOTAL] 4.4122

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.342
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5300262  0.46997383] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.533 | Std: 0.039
[MASKS] A(Pass/Fail): 586/1462 | B: 491/1557 | C: 290/1758
[LOSS Ex1] A: 0.66458 | B: 0.66053 | C: 0.65626
[LOGITS Ex2 A] Mean Abs: 1.701 | Max: 5.874
[LOSS Ex2] A: 0.18775 | B: 0.40231 | C: 0.29261
** [JOINT LOSS] ** : 0.954680
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004413 | Grad Max: 0.115910
  -> Layer: shared_layers.0.bias | Grad Mean: 0.250935 | Grad Max: 0.903025
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.006628
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005240 | Grad Max: 0.005240
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001683 | Grad Max: 0.187139
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031425 | Grad Max: 1.045301
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000308 | Grad Max: 0.010065
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014695 | Grad Max: 0.071098
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000502
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003234 | Grad Max: 0.007175
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000237
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000939 | Grad Max: 0.002219
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001253 | Grad Max: 0.003177
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023251 | Grad Max: 0.023251
[GRADIENT NORM TOTAL] 5.0123

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.477
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63904685 0.36095318] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.041
[MASKS] A(Pass/Fail): 658/1390 | B: 510/1538 | C: 266/1782
[LOSS Ex1] A: 0.66079 | B: 0.66113 | C: 0.65668
[LOGITS Ex2 A] Mean Abs: 1.749 | Max: 6.672
[LOSS Ex2] A: 0.18968 | B: 0.40676 | C: 0.33208
** [JOINT LOSS] ** : 0.969044
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004413 | Grad Max: 0.132613
  -> Layer: shared_layers.0.bias | Grad Mean: 0.364266 | Grad Max: 1.736161
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.007855
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011474 | Grad Max: 0.011474
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.211380
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042839 | Grad Max: 1.187697
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.014465
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021496 | Grad Max: 0.103471
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000742
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004670 | Grad Max: 0.010675
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000308
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001350 | Grad Max: 0.003163
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001815 | Grad Max: 0.003645
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032853 | Grad Max: 0.032853
[GRADIENT NORM TOTAL] 7.1584

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.535
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50013083 0.4998692 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 632/1416 | B: 506/1542 | C: 264/1784
[LOSS Ex1] A: 0.66523 | B: 0.65783 | C: 0.65584
[LOGITS Ex2 A] Mean Abs: 1.726 | Max: 5.841
[LOSS Ex2] A: 0.17618 | B: 0.37404 | C: 0.29690
** [JOINT LOSS] ** : 0.942007
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002437 | Grad Max: 0.080562
  -> Layer: shared_layers.0.bias | Grad Mean: 0.065183 | Grad Max: 0.350468
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.006815
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005661 | Grad Max: 0.005661
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000606 | Grad Max: 0.082973
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010007 | Grad Max: 0.467036
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.003981
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003290 | Grad Max: 0.022472
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000225
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000619 | Grad Max: 0.002611
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000181 | Grad Max: 0.000823
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000556 | Grad Max: 0.001634
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004970 | Grad Max: 0.004970
[GRADIENT NORM TOTAL] 1.6231

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.335
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61513406 0.38486597] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 612/1436 | B: 473/1383 | C: 288/1760
[LOSS Ex1] A: 0.66264 | B: 0.66103 | C: 0.65544
[LOGITS Ex2 A] Mean Abs: 1.686 | Max: 6.209
[LOSS Ex2] A: 0.19207 | B: 0.39056 | C: 0.33784
** [JOINT LOSS] ** : 0.966529
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007493 | Grad Max: 0.176201
  -> Layer: shared_layers.0.bias | Grad Mean: 0.498383 | Grad Max: 2.076265
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002048 | Grad Max: 0.006531
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003602 | Grad Max: 0.003602
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003120 | Grad Max: 0.330746
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058721 | Grad Max: 1.842866
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000605 | Grad Max: 0.019010
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029023 | Grad Max: 0.131452
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000911
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006346 | Grad Max: 0.013403
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000388
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001827 | Grad Max: 0.004242
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002442 | Grad Max: 0.004547
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044303 | Grad Max: 0.044303
[GRADIENT NORM TOTAL] 9.2727

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.411
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57294804 0.427052  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 504/1112 | B: 493/1555 | C: 259/1789
[LOSS Ex1] A: 0.66143 | B: 0.66033 | C: 0.65875
[LOGITS Ex2 A] Mean Abs: 1.717 | Max: 5.686
[LOSS Ex2] A: 0.18481 | B: 0.43425 | C: 0.33937
** [JOINT LOSS] ** : 0.979647
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008092 | Grad Max: 0.206760
  -> Layer: shared_layers.0.bias | Grad Mean: 0.635454 | Grad Max: 2.666595
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.006802
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004852 | Grad Max: 0.004852
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003971 | Grad Max: 0.414810
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.075184 | Grad Max: 2.303638
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000778 | Grad Max: 0.023819
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037430 | Grad Max: 0.175566
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001201
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008124 | Grad Max: 0.016397
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000481
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002358 | Grad Max: 0.005279
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003282 | Grad Max: 0.006487
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059285 | Grad Max: 0.059285
[GRADIENT NORM TOTAL] 12.2112

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.537
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50612056 0.4938794 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 637/1411 | B: 510/1538 | C: 257/1791
[LOSS Ex1] A: 0.66164 | B: 0.66094 | C: 0.65860
[LOGITS Ex2 A] Mean Abs: 1.716 | Max: 7.204
[LOSS Ex2] A: 0.18007 | B: 0.40679 | C: 0.30827
** [JOINT LOSS] ** : 0.958771
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004062 | Grad Max: 0.124543
  -> Layer: shared_layers.0.bias | Grad Mean: 0.358357 | Grad Max: 1.543433
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002041 | Grad Max: 0.006349
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001608 | Grad Max: 0.001608
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.247619
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042047 | Grad Max: 1.403629
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.015734
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021594 | Grad Max: 0.106267
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000706
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004617 | Grad Max: 0.010160
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000296
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001327 | Grad Max: 0.002923
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001863 | Grad Max: 0.003196
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033095 | Grad Max: 0.033095
[GRADIENT NORM TOTAL] 6.9557

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.486
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083471  0.49165288] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 612/1436 | B: 508/1540 | C: 278/1770
[LOSS Ex1] A: 0.65994 | B: 0.65763 | C: 0.65566
[LOGITS Ex2 A] Mean Abs: 1.774 | Max: 6.016
[LOSS Ex2] A: 0.18947 | B: 0.38606 | C: 0.31104
** [JOINT LOSS] ** : 0.953267
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006855 | Grad Max: 0.204276
  -> Layer: shared_layers.0.bias | Grad Mean: 0.331515 | Grad Max: 1.448843
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.007400
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002464 | Grad Max: 0.002464
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.231249
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042120 | Grad Max: 1.184505
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.011412
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019372 | Grad Max: 0.078371
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000646
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004300 | Grad Max: 0.009401
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000302
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001248 | Grad Max: 0.003100
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001709 | Grad Max: 0.003319
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030834 | Grad Max: 0.030834
[GRADIENT NORM TOTAL] 6.5553

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.500
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014658  0.49853417] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 623/1425 | B: 473/1383 | C: 283/1765
[LOSS Ex1] A: 0.65895 | B: 0.66085 | C: 0.65585
[LOGITS Ex2 A] Mean Abs: 1.770 | Max: 7.239
[LOSS Ex2] A: 0.22558 | B: 0.38419 | C: 0.32377
** [JOINT LOSS] ** : 0.969730
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010562 | Grad Max: 0.341549
  -> Layer: shared_layers.0.bias | Grad Mean: 0.557792 | Grad Max: 2.313211
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.007606
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000784 | Grad Max: 0.000784
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003841 | Grad Max: 0.346128
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070943 | Grad Max: 1.850459
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000702 | Grad Max: 0.018901
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033322 | Grad Max: 0.143137
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001075
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007385 | Grad Max: 0.016077
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000500
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002146 | Grad Max: 0.005283
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002919 | Grad Max: 0.005083
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052059 | Grad Max: 0.052059
[GRADIENT NORM TOTAL] 10.9466

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.420
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041726  0.49582738] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 590/1458 | B: 494/1554 | C: 281/1767
[LOSS Ex1] A: 0.66458 | B: 0.66015 | C: 0.65219
[LOGITS Ex2 A] Mean Abs: 1.753 | Max: 6.769
[LOSS Ex2] A: 0.18399 | B: 0.40133 | C: 0.30008
** [JOINT LOSS] ** : 0.954107
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007210 | Grad Max: 0.211014
  -> Layer: shared_layers.0.bias | Grad Mean: 0.362456 | Grad Max: 1.623585
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.006377
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001979 | Grad Max: 0.001979
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002375 | Grad Max: 0.238710
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043610 | Grad Max: 1.175551
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.012406
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020254 | Grad Max: 0.092614
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000758
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004530 | Grad Max: 0.010533
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001322 | Grad Max: 0.003107
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001756 | Grad Max: 0.003577
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032265 | Grad Max: 0.032265
[GRADIENT NORM TOTAL] 6.9576

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.346
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5304678  0.46953222] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.533 | Std: 0.040
[MASKS] A(Pass/Fail): 588/1460 | B: 510/1538 | C: 197/1179
[LOSS Ex1] A: 0.66430 | B: 0.66078 | C: 0.65424
[LOGITS Ex2 A] Mean Abs: 1.660 | Max: 6.278
[LOSS Ex2] A: 0.18396 | B: 0.40069 | C: 0.31378
** [JOINT LOSS] ** : 0.959250
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.071598
  -> Layer: shared_layers.0.bias | Grad Mean: 0.163129 | Grad Max: 0.862666
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.006609
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005358 | Grad Max: 0.005358
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001103 | Grad Max: 0.157590
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020034 | Grad Max: 0.883447
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000190 | Grad Max: 0.007440
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009252 | Grad Max: 0.053541
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000362
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001995 | Grad Max: 0.004881
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000156
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000585 | Grad Max: 0.001607
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000846 | Grad Max: 0.001777
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015312 | Grad Max: 0.015312
[GRADIENT NORM TOTAL] 3.4284

[EPOCH SUMMARY] Train Loss: 0.9580

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9346 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9353 -> New: 0.9346)

############################## EPOCH 74/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.480
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6408551  0.35914493] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 660/1388 | B: 511/1537 | C: 264/1784
[LOSS Ex1] A: 0.66046 | B: 0.65747 | C: 0.65498
[LOGITS Ex2 A] Mean Abs: 1.710 | Max: 6.486
[LOSS Ex2] A: 0.16524 | B: 0.38674 | C: 0.29831
** [JOINT LOSS] ** : 0.941070
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003597 | Grad Max: 0.083479
  -> Layer: shared_layers.0.bias | Grad Mean: 0.254006 | Grad Max: 1.231914
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.007762
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008000 | Grad Max: 0.008000
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001598 | Grad Max: 0.182312
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029967 | Grad Max: 1.021170
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.011916
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014977 | Grad Max: 0.074023
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000581
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003232 | Grad Max: 0.007975
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000916 | Grad Max: 0.002447
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001180 | Grad Max: 0.002706
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021406 | Grad Max: 0.021406
[GRADIENT NORM TOTAL] 4.9766

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.539
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003218  0.49967813] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.041
[MASKS] A(Pass/Fail): 632/1416 | B: 473/1383 | C: 240/1808
[LOSS Ex1] A: 0.66496 | B: 0.66070 | C: 0.65884
[LOGITS Ex2 A] Mean Abs: 1.731 | Max: 5.907
[LOSS Ex2] A: 0.17575 | B: 0.36742 | C: 0.30575
** [JOINT LOSS] ** : 0.944471
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004261 | Grad Max: 0.135296
  -> Layer: shared_layers.0.bias | Grad Mean: 0.101943 | Grad Max: 0.404835
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.006967
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008970 | Grad Max: 0.008970
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000778 | Grad Max: 0.108192
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013339 | Grad Max: 0.553028
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000129 | Grad Max: 0.003638
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005890 | Grad Max: 0.026240
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000291
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001368 | Grad Max: 0.003755
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000127
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000395 | Grad Max: 0.001343
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000607 | Grad Max: 0.001780
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009547 | Grad Max: 0.009547
[GRADIENT NORM TOTAL] 2.0575

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.339
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6164469  0.38355315] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 613/1435 | B: 495/1553 | C: 257/1791
[LOSS Ex1] A: 0.66236 | B: 0.66000 | C: 0.65755
[LOGITS Ex2 A] Mean Abs: 1.773 | Max: 5.776
[LOSS Ex2] A: 0.19534 | B: 0.41343 | C: 0.31515
** [JOINT LOSS] ** : 0.967943
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005409 | Grad Max: 0.153562
  -> Layer: shared_layers.0.bias | Grad Mean: 0.454520 | Grad Max: 1.992226
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.006873
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000774 | Grad Max: 0.000774
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002926 | Grad Max: 0.275385
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055001 | Grad Max: 1.553204
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000549 | Grad Max: 0.017570
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026639 | Grad Max: 0.124385
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000844
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005765 | Grad Max: 0.012078
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000389
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001648 | Grad Max: 0.004146
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002090 | Grad Max: 0.003653
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039359 | Grad Max: 0.039359
[GRADIENT NORM TOTAL] 9.0802

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.415
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5737555 0.4262445] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 506/1110 | B: 511/1537 | C: 291/1757
[LOSS Ex1] A: 0.66115 | B: 0.66062 | C: 0.65438
[LOGITS Ex2 A] Mean Abs: 1.820 | Max: 5.983
[LOSS Ex2] A: 0.18612 | B: 0.41808 | C: 0.33512
** [JOINT LOSS] ** : 0.971826
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007920 | Grad Max: 0.211429
  -> Layer: shared_layers.0.bias | Grad Mean: 0.638975 | Grad Max: 2.812143
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.007114
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001332 | Grad Max: 0.001332
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004114 | Grad Max: 0.400267
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.077609 | Grad Max: 2.170432
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000766 | Grad Max: 0.024477
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036912 | Grad Max: 0.172528
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000099 | Grad Max: 0.001190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007983 | Grad Max: 0.017391
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000494
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002283 | Grad Max: 0.005344
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002967 | Grad Max: 0.005292
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053963 | Grad Max: 0.053963
[GRADIENT NORM TOTAL] 12.5498

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.541
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50609314 0.49390683] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 639/1409 | B: 512/1536 | C: 274/1774
[LOSS Ex1] A: 0.66136 | B: 0.65731 | C: 0.65480
[LOGITS Ex2 A] Mean Abs: 1.794 | Max: 6.383
[LOSS Ex2] A: 0.17984 | B: 0.38492 | C: 0.30531
** [JOINT LOSS] ** : 0.947846
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003470 | Grad Max: 0.114626
  -> Layer: shared_layers.0.bias | Grad Mean: 0.336455 | Grad Max: 1.587149
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.006833
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000760 | Grad Max: 0.000760
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.268503
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040115 | Grad Max: 1.489549
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000379 | Grad Max: 0.014336
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018395 | Grad Max: 0.102598
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000562
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003971 | Grad Max: 0.008265
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000258
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001145 | Grad Max: 0.002570
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001496 | Grad Max: 0.003286
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027594 | Grad Max: 0.027594
[GRADIENT NORM TOTAL] 6.8119

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.490
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50853235 0.49146762] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 615/1433 | B: 473/1383 | C: 295/1753
[LOSS Ex1] A: 0.65964 | B: 0.66054 | C: 0.65496
[LOGITS Ex2 A] Mean Abs: 1.738 | Max: 5.971
[LOSS Ex2] A: 0.17516 | B: 0.37358 | C: 0.29120
** [JOINT LOSS] ** : 0.938360
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002468 | Grad Max: 0.074818
  -> Layer: shared_layers.0.bias | Grad Mean: 0.136415 | Grad Max: 0.511650
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.007674
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006742 | Grad Max: 0.006742
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000965 | Grad Max: 0.185011
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017244 | Grad Max: 1.032131
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000151 | Grad Max: 0.006486
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007119 | Grad Max: 0.040608
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000342
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001533 | Grad Max: 0.003912
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000437 | Grad Max: 0.001415
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000558 | Grad Max: 0.001749
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009818 | Grad Max: 0.009818
[GRADIENT NORM TOTAL] 3.1511

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.505
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014651  0.49853495] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 625/1423 | B: 496/1552 | C: 264/1784
[LOSS Ex1] A: 0.65866 | B: 0.65985 | C: 0.65654
[LOGITS Ex2 A] Mean Abs: 1.717 | Max: 6.797
[LOSS Ex2] A: 0.19030 | B: 0.41861 | C: 0.31825
** [JOINT LOSS] ** : 0.967401
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003448 | Grad Max: 0.152888
  -> Layer: shared_layers.0.bias | Grad Mean: 0.367461 | Grad Max: 1.876357
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.008010
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005072 | Grad Max: 0.005072
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.321593
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044251 | Grad Max: 1.799676
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000423 | Grad Max: 0.014086
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020538 | Grad Max: 0.097044
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000641
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004420 | Grad Max: 0.009901
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001283 | Grad Max: 0.003017
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001708 | Grad Max: 0.003246
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031574 | Grad Max: 0.031574
[GRADIENT NORM TOTAL] 7.7611

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.423
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040328 0.4959672] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 593/1455 | B: 511/1537 | C: 279/1769
[LOSS Ex1] A: 0.66434 | B: 0.66048 | C: 0.65623
[LOGITS Ex2 A] Mean Abs: 1.702 | Max: 6.472
[LOSS Ex2] A: 0.17792 | B: 0.39534 | C: 0.29356
** [JOINT LOSS] ** : 0.949288
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002514 | Grad Max: 0.071267
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191097 | Grad Max: 0.903039
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.006075
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006312 | Grad Max: 0.006312
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001311 | Grad Max: 0.181819
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024141 | Grad Max: 0.986535
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.009265
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011761 | Grad Max: 0.065312
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000398
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002475 | Grad Max: 0.005443
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000182
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000705 | Grad Max: 0.001653
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000910 | Grad Max: 0.002126
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016882 | Grad Max: 0.016882
[GRADIENT NORM TOTAL] 4.1071

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.350
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5308369 0.4691631] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.040
[MASKS] A(Pass/Fail): 589/1459 | B: 513/1535 | C: 279/1769
[LOSS Ex1] A: 0.66407 | B: 0.65716 | C: 0.65480
[LOGITS Ex2 A] Mean Abs: 1.713 | Max: 6.328
[LOSS Ex2] A: 0.18615 | B: 0.37939 | C: 0.29864
** [JOINT LOSS] ** : 0.946737
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005728 | Grad Max: 0.174757
  -> Layer: shared_layers.0.bias | Grad Mean: 0.266629 | Grad Max: 1.191052
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006538
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006526 | Grad Max: 0.006526
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001797 | Grad Max: 0.205774
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032988 | Grad Max: 1.040036
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000325 | Grad Max: 0.010173
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015516 | Grad Max: 0.067375
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000545
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003441 | Grad Max: 0.007668
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000242
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000994 | Grad Max: 0.002298
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001261 | Grad Max: 0.003077
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024109 | Grad Max: 0.024109
[GRADIENT NORM TOTAL] 5.2357

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.484
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64239913 0.35760087] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.042
[MASKS] A(Pass/Fail): 661/1387 | B: 473/1383 | C: 279/1769
[LOSS Ex1] A: 0.66018 | B: 0.66040 | C: 0.65452
[LOGITS Ex2 A] Mean Abs: 1.753 | Max: 6.385
[LOSS Ex2] A: 0.17720 | B: 0.37250 | C: 0.31691
** [JOINT LOSS] ** : 0.947238
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005437 | Grad Max: 0.145249
  -> Layer: shared_layers.0.bias | Grad Mean: 0.303123 | Grad Max: 1.389012
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.007765
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010381 | Grad Max: 0.010381
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.216676
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038423 | Grad Max: 1.162578
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000384 | Grad Max: 0.011669
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018452 | Grad Max: 0.087296
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000642
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004082 | Grad Max: 0.008853
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000276
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001195 | Grad Max: 0.002807
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001646 | Grad Max: 0.003687
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030009 | Grad Max: 0.030009
[GRADIENT NORM TOTAL] 6.0780

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.543
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004664 0.4995336] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 633/1415 | B: 497/1551 | C: 290/1758
[LOSS Ex1] A: 0.66472 | B: 0.65971 | C: 0.65225
[LOGITS Ex2 A] Mean Abs: 1.732 | Max: 6.042
[LOSS Ex2] A: 0.17282 | B: 0.40741 | C: 0.28893
** [JOINT LOSS] ** : 0.948610
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.071667
  -> Layer: shared_layers.0.bias | Grad Mean: 0.046596 | Grad Max: 0.221297
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.006677
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003201 | Grad Max: 0.003201
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000459 | Grad Max: 0.062611
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007468 | Grad Max: 0.346441
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003803
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002185 | Grad Max: 0.023501
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000342 | Grad Max: 0.001832
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000521
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000361 | Grad Max: 0.001045
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002013 | Grad Max: 0.002013
[GRADIENT NORM TOTAL] 1.1469

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.343
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61766315 0.38233688] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 615/1433 | B: 514/1534 | C: 272/1776
[LOSS Ex1] A: 0.66210 | B: 0.66034 | C: 0.65599
[LOGITS Ex2 A] Mean Abs: 1.696 | Max: 5.673
[LOSS Ex2] A: 0.19174 | B: 0.40773 | C: 0.29849
** [JOINT LOSS] ** : 0.958800
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005719 | Grad Max: 0.136227
  -> Layer: shared_layers.0.bias | Grad Mean: 0.392534 | Grad Max: 1.762053
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.007040
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003156 | Grad Max: 0.003156
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002486 | Grad Max: 0.246528
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046750 | Grad Max: 1.351282
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000466 | Grad Max: 0.013054
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022481 | Grad Max: 0.092767
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000703
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004897 | Grad Max: 0.010607
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000315
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001416 | Grad Max: 0.003455
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001886 | Grad Max: 0.003480
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034566 | Grad Max: 0.034566
[GRADIENT NORM TOTAL] 7.4744

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.419
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57454455 0.4254554 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 507/1109 | B: 513/1535 | C: 267/1781
[LOSS Ex1] A: 0.66088 | B: 0.65701 | C: 0.65590
[LOGITS Ex2 A] Mean Abs: 1.732 | Max: 6.000
[LOSS Ex2] A: 0.17488 | B: 0.38857 | C: 0.30512
** [JOINT LOSS] ** : 0.947453
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005670 | Grad Max: 0.146740
  -> Layer: shared_layers.0.bias | Grad Mean: 0.434910 | Grad Max: 1.991657
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.006933
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002844 | Grad Max: 0.002844
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002783 | Grad Max: 0.343451
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052521 | Grad Max: 1.929182
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000525 | Grad Max: 0.016084
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025470 | Grad Max: 0.118153
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000740
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005562 | Grad Max: 0.011175
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000364
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001611 | Grad Max: 0.003719
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002229 | Grad Max: 0.003962
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039698 | Grad Max: 0.039698
[GRADIENT NORM TOTAL] 8.6660

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.545
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50607413 0.4939259 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 639/1409 | B: 475/1381 | C: 166/1210
[LOSS Ex1] A: 0.66109 | B: 0.66025 | C: 0.65683
[LOGITS Ex2 A] Mean Abs: 1.741 | Max: 6.029
[LOSS Ex2] A: 0.17784 | B: 0.37340 | C: 0.30609
** [JOINT LOSS] ** : 0.945169
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003330 | Grad Max: 0.067401
  -> Layer: shared_layers.0.bias | Grad Mean: 0.179443 | Grad Max: 0.837595
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.006479
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000997 | Grad Max: 0.000997
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001167 | Grad Max: 0.099099
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021414 | Grad Max: 0.556341
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.008204
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011051 | Grad Max: 0.054507
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000323
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002373 | Grad Max: 0.005391
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000167
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000686 | Grad Max: 0.001668
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000988 | Grad Max: 0.002230
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017387 | Grad Max: 0.017387
[GRADIENT NORM TOTAL] 3.4015

[EPOCH SUMMARY] Train Loss: 0.9516

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9420 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 75/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.493
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.508659   0.49134097] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 617/1431 | B: 498/1550 | C: 267/1781
[LOSS Ex1] A: 0.65934 | B: 0.65955 | C: 0.65470
[LOGITS Ex2 A] Mean Abs: 1.805 | Max: 5.753
[LOSS Ex2] A: 0.19245 | B: 0.40734 | C: 0.30279
** [JOINT LOSS] ** : 0.958725
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008820 | Grad Max: 0.271262
  -> Layer: shared_layers.0.bias | Grad Mean: 0.477312 | Grad Max: 1.967880
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.008113
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006325 | Grad Max: 0.006325
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003193 | Grad Max: 0.325503
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059155 | Grad Max: 1.693394
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.015654
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027553 | Grad Max: 0.107040
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000827
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006126 | Grad Max: 0.012922
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000406
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001774 | Grad Max: 0.004224
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002341 | Grad Max: 0.004136
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042818 | Grad Max: 0.042818
[GRADIENT NORM TOTAL] 9.2480

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.509
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014511  0.49854892] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 625/1423 | B: 515/1533 | C: 268/1780
[LOSS Ex1] A: 0.65837 | B: 0.66019 | C: 0.65597
[LOGITS Ex2 A] Mean Abs: 1.800 | Max: 6.933
[LOSS Ex2] A: 0.21913 | B: 0.41410 | C: 0.32946
** [JOINT LOSS] ** : 0.979073
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012172 | Grad Max: 0.378636
  -> Layer: shared_layers.0.bias | Grad Mean: 0.774389 | Grad Max: 3.326923
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.007287
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000460 | Grad Max: 0.000460
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005184 | Grad Max: 0.480590
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095890 | Grad Max: 2.603522
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000935 | Grad Max: 0.024772
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045187 | Grad Max: 0.188484
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000122 | Grad Max: 0.001413
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009996 | Grad Max: 0.020998
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000669
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002905 | Grad Max: 0.007066
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003913 | Grad Max: 0.006739
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.070738 | Grad Max: 0.070738
[GRADIENT NORM TOTAL] 15.2219

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.427
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50401217 0.4959878 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 595/1453 | B: 514/1534 | C: 280/1768
[LOSS Ex1] A: 0.66410 | B: 0.65685 | C: 0.65526
[LOGITS Ex2 A] Mean Abs: 1.752 | Max: 5.973
[LOSS Ex2] A: 0.18649 | B: 0.40208 | C: 0.31173
** [JOINT LOSS] ** : 0.958838
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010126 | Grad Max: 0.280760
  -> Layer: shared_layers.0.bias | Grad Mean: 0.611941 | Grad Max: 2.700320
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.006025
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003677 | Grad Max: 0.003677
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004008 | Grad Max: 0.420772
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074347 | Grad Max: 2.284436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000730 | Grad Max: 0.022831
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035232 | Grad Max: 0.169613
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000096 | Grad Max: 0.001083
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007784 | Grad Max: 0.016435
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000497
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002258 | Grad Max: 0.005344
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002911 | Grad Max: 0.005272
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053901 | Grad Max: 0.053901
[GRADIENT NORM TOTAL] 11.7868

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.354
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53107405 0.46892592] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 591/1457 | B: 477/1379 | C: 276/1772
[LOSS Ex1] A: 0.66383 | B: 0.66012 | C: 0.65451
[LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.168
[LOSS Ex2] A: 0.18210 | B: 0.36551 | C: 0.31009
** [JOINT LOSS] ** : 0.945387
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004310 | Grad Max: 0.137336
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140868 | Grad Max: 0.531879
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.006652
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000845 | Grad Max: 0.000845
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001047 | Grad Max: 0.114490
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018817 | Grad Max: 0.638174
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000185 | Grad Max: 0.005913
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008803 | Grad Max: 0.041494
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000361
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001986 | Grad Max: 0.004848
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000147
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000591 | Grad Max: 0.001431
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000827 | Grad Max: 0.002231
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015350 | Grad Max: 0.015350
[GRADIENT NORM TOTAL] 2.8297

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.488
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6437795  0.35622048] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 662/1386 | B: 501/1547 | C: 288/1760
[LOSS Ex1] A: 0.65991 | B: 0.65942 | C: 0.65390
[LOGITS Ex2 A] Mean Abs: 1.697 | Max: 5.945
[LOSS Ex2] A: 0.17845 | B: 0.42957 | C: 0.31953
** [JOINT LOSS] ** : 0.966928
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009328 | Grad Max: 0.212218
  -> Layer: shared_layers.0.bias | Grad Mean: 0.651825 | Grad Max: 2.795566
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.007801
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011470 | Grad Max: 0.011470
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004169 | Grad Max: 0.465280
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.078541 | Grad Max: 2.603276
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000774 | Grad Max: 0.022486
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037439 | Grad Max: 0.163317
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001072
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008154 | Grad Max: 0.016661
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000529
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002365 | Grad Max: 0.005594
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003125 | Grad Max: 0.005947
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057299 | Grad Max: 0.057299
[GRADIENT NORM TOTAL] 12.7624

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.548
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50044197 0.49955803] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 634/1414 | B: 516/1532 | C: 258/1790
[LOSS Ex1] A: 0.66448 | B: 0.66007 | C: 0.65637
[LOGITS Ex2 A] Mean Abs: 1.656 | Max: 6.024
[LOSS Ex2] A: 0.19776 | B: 0.45858 | C: 0.34740
** [JOINT LOSS] ** : 0.994887
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.016048 | Grad Max: 0.419653
  -> Layer: shared_layers.0.bias | Grad Mean: 0.975248 | Grad Max: 4.135005
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.006548
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005007 | Grad Max: 0.005007
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006269 | Grad Max: 0.655250
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.117815 | Grad Max: 3.560012
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001182 | Grad Max: 0.032413
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057244 | Grad Max: 0.248688
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000155 | Grad Max: 0.001610
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012596 | Grad Max: 0.025734
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000067 | Grad Max: 0.000737
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003670 | Grad Max: 0.008140
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004768 | Grad Max: 0.009722
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.089243 | Grad Max: 0.089243
[GRADIENT NORM TOTAL] 18.5665

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.346
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61865026 0.38134974] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 618/1430 | B: 515/1533 | C: 264/1784
[LOSS Ex1] A: 0.66185 | B: 0.65673 | C: 0.65584
[LOGITS Ex2 A] Mean Abs: 1.632 | Max: 6.375
[LOSS Ex2] A: 0.21864 | B: 0.44103 | C: 0.33473
** [JOINT LOSS] ** : 0.989605
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014827 | Grad Max: 0.386849
  -> Layer: shared_layers.0.bias | Grad Mean: 0.892660 | Grad Max: 3.725726
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.007030
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003850 | Grad Max: 0.003850
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005756 | Grad Max: 0.612693
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.108732 | Grad Max: 3.292723
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001098 | Grad Max: 0.030558
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053023 | Grad Max: 0.222325
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000144 | Grad Max: 0.001551
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011610 | Grad Max: 0.022983
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000698
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003370 | Grad Max: 0.007619
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004493 | Grad Max: 0.009296
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.081940 | Grad Max: 0.081940
[GRADIENT NORM TOTAL] 16.9552

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.422
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5751543  0.42484573] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 507/1109 | B: 480/1376 | C: 261/1787
[LOSS Ex1] A: 0.66064 | B: 0.66001 | C: 0.65597
[LOGITS Ex2 A] Mean Abs: 1.725 | Max: 6.173
[LOSS Ex2] A: 0.19389 | B: 0.39934 | C: 0.31714
** [JOINT LOSS] ** : 0.962329
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010300 | Grad Max: 0.278950
  -> Layer: shared_layers.0.bias | Grad Mean: 0.598087 | Grad Max: 2.506990
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.007043
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001452 | Grad Max: 0.001452
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003765 | Grad Max: 0.384322
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070627 | Grad Max: 2.077984
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000716 | Grad Max: 0.021713
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034564 | Grad Max: 0.160348
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001016
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007614 | Grad Max: 0.015445
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000470
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002213 | Grad Max: 0.005139
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003030 | Grad Max: 0.005449
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.054420 | Grad Max: 0.054420
[GRADIENT NORM TOTAL] 11.1195

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.549
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060636  0.49393642] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 640/1408 | B: 502/1546 | C: 270/1778
[LOSS Ex1] A: 0.66086 | B: 0.65931 | C: 0.65571
[LOGITS Ex2 A] Mean Abs: 1.749 | Max: 6.536
[LOSS Ex2] A: 0.17463 | B: 0.40185 | C: 0.30565
** [JOINT LOSS] ** : 0.952668
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002675 | Grad Max: 0.094939
  -> Layer: shared_layers.0.bias | Grad Mean: 0.133546 | Grad Max: 0.708245
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.006526
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003555 | Grad Max: 0.003555
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000934 | Grad Max: 0.110972
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016206 | Grad Max: 0.626096
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000130 | Grad Max: 0.005809
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006280 | Grad Max: 0.039229
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000289
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001335 | Grad Max: 0.003828
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000129
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000388 | Grad Max: 0.001138
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001570
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009081 | Grad Max: 0.009081
[GRADIENT NORM TOTAL] 2.7678

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.497
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5086708 0.4913292] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 618/1430 | B: 517/1531 | C: 251/1797
[LOSS Ex1] A: 0.65911 | B: 0.65996 | C: 0.65707
[LOGITS Ex2 A] Mean Abs: 1.773 | Max: 5.868
[LOSS Ex2] A: 0.19169 | B: 0.41335 | C: 0.30100
** [JOINT LOSS] ** : 0.960726
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005918 | Grad Max: 0.203445
  -> Layer: shared_layers.0.bias | Grad Mean: 0.556879 | Grad Max: 2.621548
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.007242
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003972 | Grad Max: 0.003972
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003513 | Grad Max: 0.343868
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066460 | Grad Max: 1.932683
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000646 | Grad Max: 0.021781
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031757 | Grad Max: 0.154622
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000982
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006896 | Grad Max: 0.015295
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000413
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002004 | Grad Max: 0.004744
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002550 | Grad Max: 0.004479
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048200 | Grad Max: 0.048200
[GRADIENT NORM TOTAL] 11.0775

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.512
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014629  0.49853712] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 625/1423 | B: 517/1531 | C: 285/1763
[LOSS Ex1] A: 0.65814 | B: 0.65662 | C: 0.65299
[LOGITS Ex2 A] Mean Abs: 1.773 | Max: 6.435
[LOSS Ex2] A: 0.19547 | B: 0.40192 | C: 0.32126
** [JOINT LOSS] ** : 0.962130
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007180 | Grad Max: 0.197197
  -> Layer: shared_layers.0.bias | Grad Mean: 0.587957 | Grad Max: 2.528712
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002351 | Grad Max: 0.008007
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006085 | Grad Max: 0.006085
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003853 | Grad Max: 0.374077
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.072151 | Grad Max: 2.090177
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000726 | Grad Max: 0.022301
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035320 | Grad Max: 0.157819
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001122
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007703 | Grad Max: 0.017093
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000481
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002245 | Grad Max: 0.005441
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002882 | Grad Max: 0.005064
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053870 | Grad Max: 0.053870
[GRADIENT NORM TOTAL] 11.7154

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.430
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040271  0.49597284] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.534 | Std: 0.042
[MASKS] A(Pass/Fail): 595/1453 | B: 482/1374 | C: 269/1779
[LOSS Ex1] A: 0.66390 | B: 0.65991 | C: 0.65530
[LOGITS Ex2 A] Mean Abs: 1.718 | Max: 6.244
[LOSS Ex2] A: 0.17651 | B: 0.36980 | C: 0.29667
** [JOINT LOSS] ** : 0.940695
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003683 | Grad Max: 0.094974
  -> Layer: shared_layers.0.bias | Grad Mean: 0.249495 | Grad Max: 1.116526
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.006212
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000846 | Grad Max: 0.000846
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001664 | Grad Max: 0.171817
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030739 | Grad Max: 0.952684
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000301 | Grad Max: 0.010627
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014734 | Grad Max: 0.071975
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000455
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003276 | Grad Max: 0.007032
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000224
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000981 | Grad Max: 0.002269
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001274 | Grad Max: 0.003225
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025040 | Grad Max: 0.025040
[GRADIENT NORM TOTAL] 5.0446

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.356
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5312386  0.46876144] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 594/1454 | B: 502/1546 | C: 283/1765
[LOSS Ex1] A: 0.66364 | B: 0.65921 | C: 0.65466
[LOGITS Ex2 A] Mean Abs: 1.651 | Max: 6.387
[LOSS Ex2] A: 0.18612 | B: 0.41765 | C: 0.29993
** [JOINT LOSS] ** : 0.960405
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005341 | Grad Max: 0.137628
  -> Layer: shared_layers.0.bias | Grad Mean: 0.462723 | Grad Max: 1.927409
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.007046
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008204 | Grad Max: 0.008204
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002895 | Grad Max: 0.317643
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054530 | Grad Max: 1.777605
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000551 | Grad Max: 0.016008
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026951 | Grad Max: 0.117312
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000756
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005900 | Grad Max: 0.012067
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000388
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001717 | Grad Max: 0.004236
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002167 | Grad Max: 0.003803
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040985 | Grad Max: 0.040985
[GRADIENT NORM TOTAL] 9.0132

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.491
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64482635 0.35517365] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 663/1385 | B: 517/1531 | C: 208/1168
[LOSS Ex1] A: 0.65971 | B: 0.65987 | C: 0.65185
[LOGITS Ex2 A] Mean Abs: 1.680 | Max: 6.122
[LOSS Ex2] A: 0.17170 | B: 0.43048 | C: 0.34276
** [JOINT LOSS] ** : 0.972120
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007217 | Grad Max: 0.205040
  -> Layer: shared_layers.0.bias | Grad Mean: 0.642563 | Grad Max: 2.751889
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006963
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003828 | Grad Max: 0.003828
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004039 | Grad Max: 0.413030
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.076199 | Grad Max: 2.318005
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000765 | Grad Max: 0.022387
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037433 | Grad Max: 0.159917
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.001053
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008196 | Grad Max: 0.016948
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000516
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002401 | Grad Max: 0.005676
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003006 | Grad Max: 0.006109
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058096 | Grad Max: 0.058096
[GRADIENT NORM TOTAL] 12.5835

[EPOCH SUMMARY] Train Loss: 0.9646

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9430 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 76/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.551
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50042564 0.49957436] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 635/1413 | B: 518/1530 | C: 263/1785
[LOSS Ex1] A: 0.66430 | B: 0.65652 | C: 0.65532
[LOGITS Ex2 A] Mean Abs: 1.688 | Max: 5.436
[LOSS Ex2] A: 0.17369 | B: 0.40355 | C: 0.32805
** [JOINT LOSS] ** : 0.960476
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007215 | Grad Max: 0.175573
  -> Layer: shared_layers.0.bias | Grad Mean: 0.520981 | Grad Max: 2.228011
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.006125
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002123 | Grad Max: 0.002123
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003303 | Grad Max: 0.323602
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061692 | Grad Max: 1.787128
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000610 | Grad Max: 0.018344
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029792 | Grad Max: 0.132651
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000815
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006583 | Grad Max: 0.013256
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000413
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001928 | Grad Max: 0.004642
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002467 | Grad Max: 0.004649
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046444 | Grad Max: 0.046444
[GRADIENT NORM TOTAL] 10.0757

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.349
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6193959  0.38060412] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 618/1430 | B: 482/1374 | C: 295/1753
[LOSS Ex1] A: 0.66166 | B: 0.65982 | C: 0.65131
[LOGITS Ex2 A] Mean Abs: 1.710 | Max: 5.833
[LOSS Ex2] A: 0.19456 | B: 0.36767 | C: 0.28880
** [JOINT LOSS] ** : 0.941270
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002395 | Grad Max: 0.086297
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116746 | Grad Max: 0.614690
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006947
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000049 | Grad Max: 0.000049
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000781 | Grad Max: 0.126645
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013711 | Grad Max: 0.722070
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.006642
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005914 | Grad Max: 0.040219
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000288
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001221 | Grad Max: 0.003586
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000114
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000339 | Grad Max: 0.001064
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000432 | Grad Max: 0.001590
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007128 | Grad Max: 0.007128
[GRADIENT NORM TOTAL] 2.3734

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.425
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5755389 0.4244611] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 507/1109 | B: 502/1546 | C: 245/1803
[LOSS Ex1] A: 0.66044 | B: 0.65911 | C: 0.65859
[LOGITS Ex2 A] Mean Abs: 1.785 | Max: 6.157
[LOSS Ex2] A: 0.19228 | B: 0.41923 | C: 0.32623
** [JOINT LOSS] ** : 0.971961
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009604 | Grad Max: 0.227020
  -> Layer: shared_layers.0.bias | Grad Mean: 0.613315 | Grad Max: 2.654439
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.006749
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001154 | Grad Max: 0.001154
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004005 | Grad Max: 0.353312
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.075460 | Grad Max: 1.928097
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000749 | Grad Max: 0.021833
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036463 | Grad Max: 0.163876
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001070
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008012 | Grad Max: 0.016831
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000475
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002331 | Grad Max: 0.005367
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002869 | Grad Max: 0.004928
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.054567 | Grad Max: 0.054567
[GRADIENT NORM TOTAL] 11.9006

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.552
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061264  0.49387363] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 640/1408 | B: 518/1530 | C: 260/1788
[LOSS Ex1] A: 0.66066 | B: 0.65977 | C: 0.65648
[LOGITS Ex2 A] Mean Abs: 1.801 | Max: 6.926
[LOSS Ex2] A: 0.20986 | B: 0.43662 | C: 0.35113
** [JOINT LOSS] ** : 0.991508
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014648 | Grad Max: 0.358917
  -> Layer: shared_layers.0.bias | Grad Mean: 0.929795 | Grad Max: 3.897186
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.007049
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004907 | Grad Max: 0.004907
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006068 | Grad Max: 0.574206
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.113869 | Grad Max: 3.114107
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001120 | Grad Max: 0.031238
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.054522 | Grad Max: 0.236907
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000145 | Grad Max: 0.001609
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012016 | Grad Max: 0.024755
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000064 | Grad Max: 0.000765
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003536 | Grad Max: 0.008290
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004696 | Grad Max: 0.008469
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.085741 | Grad Max: 0.085741
[GRADIENT NORM TOTAL] 18.0800

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.500
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.508697   0.49130306] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 619/1429 | B: 519/1529 | C: 289/1759
[LOSS Ex1] A: 0.65890 | B: 0.65642 | C: 0.65418
[LOGITS Ex2 A] Mean Abs: 1.764 | Max: 6.156
[LOSS Ex2] A: 0.19967 | B: 0.42156 | C: 0.31591
** [JOINT LOSS] ** : 0.968877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011543 | Grad Max: 0.289520
  -> Layer: shared_layers.0.bias | Grad Mean: 0.796664 | Grad Max: 3.239415
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.007668
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004845 | Grad Max: 0.004845
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005074 | Grad Max: 0.504180
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095532 | Grad Max: 2.736287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000965 | Grad Max: 0.030094
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047113 | Grad Max: 0.225837
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000125 | Grad Max: 0.001411
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010334 | Grad Max: 0.021575
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000614
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003040 | Grad Max: 0.007187
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003948 | Grad Max: 0.006641
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.073448 | Grad Max: 0.073448
[GRADIENT NORM TOTAL] 15.4275

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.515
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015361 0.4984639] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 628/1420 | B: 483/1373 | C: 276/1772
[LOSS Ex1] A: 0.65792 | B: 0.65972 | C: 0.65327
[LOGITS Ex2 A] Mean Abs: 1.732 | Max: 6.952
[LOSS Ex2] A: 0.20036 | B: 0.38018 | C: 0.29294
** [JOINT LOSS] ** : 0.948130
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005506 | Grad Max: 0.138064
  -> Layer: shared_layers.0.bias | Grad Mean: 0.347541 | Grad Max: 1.291861
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002250 | Grad Max: 0.008143
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006498 | Grad Max: 0.006498
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.239634
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041751 | Grad Max: 1.306722
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad Max: 0.012831
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020497 | Grad Max: 0.097711
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000645
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004561 | Grad Max: 0.009266
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000291
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001356 | Grad Max: 0.003154
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001836 | Grad Max: 0.003827
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034123 | Grad Max: 0.034123
[GRADIENT NORM TOTAL] 6.7513

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.432
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040813  0.49591866] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 600/1448 | B: 503/1545 | C: 263/1785
[LOSS Ex1] A: 0.66371 | B: 0.65902 | C: 0.65482
[LOGITS Ex2 A] Mean Abs: 1.661 | Max: 6.650
[LOSS Ex2] A: 0.18614 | B: 0.41432 | C: 0.28759
** [JOINT LOSS] ** : 0.955203
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006753 | Grad Max: 0.185751
  -> Layer: shared_layers.0.bias | Grad Mean: 0.375633 | Grad Max: 1.607423
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.005886
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002350 | Grad Max: 0.002350
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002568 | Grad Max: 0.297751
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047903 | Grad Max: 1.683822
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000463 | Grad Max: 0.012822
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022436 | Grad Max: 0.095253
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000706
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004995 | Grad Max: 0.010950
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001468 | Grad Max: 0.003554
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001967 | Grad Max: 0.003550
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035558 | Grad Max: 0.035558
[GRADIENT NORM TOTAL] 7.6412

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.359
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53137517 0.4686249 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 595/1453 | B: 519/1529 | C: 278/1770
[LOSS Ex1] A: 0.66346 | B: 0.65969 | C: 0.65344
[LOGITS Ex2 A] Mean Abs: 1.596 | Max: 6.907
[LOSS Ex2] A: 0.20244 | B: 0.43500 | C: 0.32132
** [JOINT LOSS] ** : 0.978450
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011227 | Grad Max: 0.286037
  -> Layer: shared_layers.0.bias | Grad Mean: 0.695478 | Grad Max: 2.929729
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.006518
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002779 | Grad Max: 0.002779
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004551 | Grad Max: 0.550032
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.085066 | Grad Max: 2.994334
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000831 | Grad Max: 0.023480
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040473 | Grad Max: 0.177985
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000108 | Grad Max: 0.001145
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009010 | Grad Max: 0.018508
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000530
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002665 | Grad Max: 0.005935
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003428 | Grad Max: 0.006562
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064918 | Grad Max: 0.064918
[GRADIENT NORM TOTAL] 13.6335

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.493
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6457763  0.35422373] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 665/1383 | B: 520/1528 | C: 286/1762
[LOSS Ex1] A: 0.65951 | B: 0.65633 | C: 0.65183
[LOGITS Ex2 A] Mean Abs: 1.657 | Max: 5.729
[LOSS Ex2] A: 0.19275 | B: 0.40719 | C: 0.30189
** [JOINT LOSS] ** : 0.956497
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010744 | Grad Max: 0.254590
  -> Layer: shared_layers.0.bias | Grad Mean: 0.611231 | Grad Max: 2.507270
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002327 | Grad Max: 0.007695
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007409 | Grad Max: 0.007409
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004114 | Grad Max: 0.454392
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.076652 | Grad Max: 2.529959
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000751 | Grad Max: 0.021763
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036560 | Grad Max: 0.158557
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.000999
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008149 | Grad Max: 0.016390
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000505
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002389 | Grad Max: 0.005931
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003021 | Grad Max: 0.005466
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.056584 | Grad Max: 0.056584
[GRADIENT NORM TOTAL] 12.0100

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.554
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003772  0.49962285] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.042
[MASKS] A(Pass/Fail): 637/1411 | B: 484/1372 | C: 295/1753
[LOSS Ex1] A: 0.66413 | B: 0.65964 | C: 0.65299
[LOGITS Ex2 A] Mean Abs: 1.695 | Max: 5.589
[LOSS Ex2] A: 0.17422 | B: 0.37269 | C: 0.29734
** [JOINT LOSS] ** : 0.940336
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005714 | Grad Max: 0.199589
  -> Layer: shared_layers.0.bias | Grad Mean: 0.251036 | Grad Max: 1.119328
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.006745
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005731 | Grad Max: 0.005731
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001761 | Grad Max: 0.300203
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031810 | Grad Max: 1.686210
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000295 | Grad Max: 0.010782
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014146 | Grad Max: 0.073461
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000482
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003214 | Grad Max: 0.007041
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000243
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000954 | Grad Max: 0.002535
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001252 | Grad Max: 0.002952
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022962 | Grad Max: 0.022962
[GRADIENT NORM TOTAL] 5.3986

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.351
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62007815 0.37992182] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 620/1428 | B: 503/1545 | C: 255/1793
[LOSS Ex1] A: 0.66148 | B: 0.65893 | C: 0.65766
[LOGITS Ex2 A] Mean Abs: 1.723 | Max: 5.419
[LOSS Ex2] A: 0.19513 | B: 0.39762 | C: 0.30536
** [JOINT LOSS] ** : 0.958726
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003993 | Grad Max: 0.113352
  -> Layer: shared_layers.0.bias | Grad Mean: 0.347431 | Grad Max: 1.402507
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.006493
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001657 | Grad Max: 0.001657
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.229580
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041441 | Grad Max: 1.299487
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.013933
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020458 | Grad Max: 0.103260
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000640
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004481 | Grad Max: 0.009906
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000301
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001326 | Grad Max: 0.003113
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001642 | Grad Max: 0.003394
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032586 | Grad Max: 0.032586
[GRADIENT NORM TOTAL] 6.8495

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.427
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57596165 0.42403835] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 509/1107 | B: 519/1529 | C: 246/1802
[LOSS Ex1] A: 0.66025 | B: 0.65960 | C: 0.65826
[LOGITS Ex2 A] Mean Abs: 1.775 | Max: 5.709
[LOSS Ex2] A: 0.19314 | B: 0.41856 | C: 0.33451
** [JOINT LOSS] ** : 0.974778
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006935 | Grad Max: 0.221296
  -> Layer: shared_layers.0.bias | Grad Mean: 0.627665 | Grad Max: 2.879717
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.006633
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000852 | Grad Max: 0.000852
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003938 | Grad Max: 0.380767
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074655 | Grad Max: 2.126667
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000747 | Grad Max: 0.022521
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036793 | Grad Max: 0.170642
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001023
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008069 | Grad Max: 0.016022
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000508
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002391 | Grad Max: 0.005689
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003041 | Grad Max: 0.005037
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057661 | Grad Max: 0.057661
[GRADIENT NORM TOTAL] 12.3411

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.555
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061742 0.4938258] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 644/1404 | B: 521/1527 | C: 298/1750
[LOSS Ex1] A: 0.66049 | B: 0.65624 | C: 0.65395
[LOGITS Ex2 A] Mean Abs: 1.741 | Max: 5.844
[LOSS Ex2] A: 0.19411 | B: 0.39667 | C: 0.33113
** [JOINT LOSS] ** : 0.964196
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007199 | Grad Max: 0.177909
  -> Layer: shared_layers.0.bias | Grad Mean: 0.530358 | Grad Max: 2.375537
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.007133
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004040 | Grad Max: 0.004040
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003396 | Grad Max: 0.342870
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064011 | Grad Max: 1.864028
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000632 | Grad Max: 0.020024
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030936 | Grad Max: 0.144832
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000886
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006812 | Grad Max: 0.013948
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000404
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002019 | Grad Max: 0.004631
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002564 | Grad Max: 0.004694
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048498 | Grad Max: 0.048498
[GRADIENT NORM TOTAL] 10.4090

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.503
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5087496 0.4912504] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 620/1428 | B: 486/1370 | C: 190/1186
[LOSS Ex1] A: 0.65872 | B: 0.65955 | C: 0.65347
[LOGITS Ex2 A] Mean Abs: 1.707 | Max: 5.696
[LOSS Ex2] A: 0.17446 | B: 0.36599 | C: 0.29955
** [JOINT LOSS] ** : 0.937249
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002663 | Grad Max: 0.065839
  -> Layer: shared_layers.0.bias | Grad Mean: 0.051692 | Grad Max: 0.241881
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.007526
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003026 | Grad Max: 0.003026
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000497 | Grad Max: 0.092256
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008381 | Grad Max: 0.520814
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.004689
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002677 | Grad Max: 0.020695
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000175
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000615 | Grad Max: 0.002438
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000188 | Grad Max: 0.000892
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.001581
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004982 | Grad Max: 0.004982
[GRADIENT NORM TOTAL] 1.5265

[EPOCH SUMMARY] Train Loss: 0.9605

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9404 | Alpha: 0.5500
No improve count: 3/15

############################## EPOCH 77/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.518
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50158155 0.49841845] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 629/1419 | B: 506/1542 | C: 258/1790
[LOSS Ex1] A: 0.65773 | B: 0.65885 | C: 0.65454
[LOGITS Ex2 A] Mean Abs: 1.649 | Max: 6.326
[LOSS Ex2] A: 0.20595 | B: 0.43135 | C: 0.30266
** [JOINT LOSS] ** : 0.970356
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006368 | Grad Max: 0.159190
  -> Layer: shared_layers.0.bias | Grad Mean: 0.494131 | Grad Max: 2.092587
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.007461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002225 | Grad Max: 0.002225
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003099 | Grad Max: 0.366914
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058439 | Grad Max: 2.063640
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000567 | Grad Max: 0.018361
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027897 | Grad Max: 0.130908
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000797
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006114 | Grad Max: 0.012796
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000376
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001810 | Grad Max: 0.004156
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002302 | Grad Max: 0.004390
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043933 | Grad Max: 0.043933
[GRADIENT NORM TOTAL] 9.5870

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.434
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040698 0.4959302] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 601/1447 | B: 519/1529 | C: 269/1779
[LOSS Ex1] A: 0.66355 | B: 0.65951 | C: 0.65506
[LOGITS Ex2 A] Mean Abs: 1.597 | Max: 5.624
[LOSS Ex2] A: 0.18634 | B: 0.43646 | C: 0.31715
** [JOINT LOSS] ** : 0.972695
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009568 | Grad Max: 0.226326
  -> Layer: shared_layers.0.bias | Grad Mean: 0.667073 | Grad Max: 2.783952
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.006054
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002685 | Grad Max: 0.002685
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004266 | Grad Max: 0.434856
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080497 | Grad Max: 2.372323
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000801 | Grad Max: 0.025499
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039358 | Grad Max: 0.192007
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001112
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008690 | Grad Max: 0.018427
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000548
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002566 | Grad Max: 0.006073
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003245 | Grad Max: 0.006172
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061642 | Grad Max: 0.061642
[GRADIENT NORM TOTAL] 12.8492

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.361
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53153455 0.46846545] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.534 | Std: 0.041
[MASKS] A(Pass/Fail): 595/1453 | B: 521/1527 | C: 270/1778
[LOSS Ex1] A: 0.66330 | B: 0.65614 | C: 0.65448
[LOGITS Ex2 A] Mean Abs: 1.602 | Max: 5.552
[LOSS Ex2] A: 0.19815 | B: 0.40373 | C: 0.31166
** [JOINT LOSS] ** : 0.962482
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008435 | Grad Max: 0.202870
  -> Layer: shared_layers.0.bias | Grad Mean: 0.502927 | Grad Max: 2.160730
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.006475
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002889 | Grad Max: 0.002889
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003221 | Grad Max: 0.352473
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060652 | Grad Max: 1.941685
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000612 | Grad Max: 0.017428
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029892 | Grad Max: 0.129145
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000919
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006614 | Grad Max: 0.014140
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000435
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001957 | Grad Max: 0.004901
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002537 | Grad Max: 0.004560
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046971 | Grad Max: 0.046971
[GRADIENT NORM TOTAL] 9.5342

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.496
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6467272  0.35327277] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 666/1382 | B: 487/1369 | C: 299/1749
[LOSS Ex1] A: 0.65932 | B: 0.65945 | C: 0.65157
[LOGITS Ex2 A] Mean Abs: 1.686 | Max: 5.826
[LOSS Ex2] A: 0.17760 | B: 0.36299 | C: 0.31092
** [JOINT LOSS] ** : 0.940614
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001662 | Grad Max: 0.043027
  -> Layer: shared_layers.0.bias | Grad Mean: 0.052840 | Grad Max: 0.228319
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.007701
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007218 | Grad Max: 0.007218
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000456 | Grad Max: 0.072409
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007948 | Grad Max: 0.391340
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000064 | Grad Max: 0.003280
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002765 | Grad Max: 0.020500
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000224
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000555 | Grad Max: 0.002718
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000095
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000156 | Grad Max: 0.000636
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000467 | Grad Max: 0.001486
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003531 | Grad Max: 0.003531
[GRADIENT NORM TOTAL] 1.3861

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.557
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004179  0.49958214] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 638/1410 | B: 506/1542 | C: 278/1770
[LOSS Ex1] A: 0.66396 | B: 0.65874 | C: 0.65284
[LOGITS Ex2 A] Mean Abs: 1.744 | Max: 5.326
[LOSS Ex2] A: 0.18184 | B: 0.41925 | C: 0.33741
** [JOINT LOSS] ** : 0.971345
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007682 | Grad Max: 0.197931
  -> Layer: shared_layers.0.bias | Grad Mean: 0.626019 | Grad Max: 2.501388
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.006042
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000189 | Grad Max: 0.000189
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003940 | Grad Max: 0.369041
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074705 | Grad Max: 2.074791
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000756 | Grad Max: 0.024334
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037276 | Grad Max: 0.184800
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001105
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008164 | Grad Max: 0.017521
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000509
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002402 | Grad Max: 0.005532
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002950 | Grad Max: 0.005171
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.056426 | Grad Max: 0.056426
[GRADIENT NORM TOTAL] 12.0735

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.354
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6208825  0.37911752] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 620/1428 | B: 520/1528 | C: 278/1770
[LOSS Ex1] A: 0.66128 | B: 0.65939 | C: 0.65583
[LOGITS Ex2 A] Mean Abs: 1.757 | Max: 5.248
[LOSS Ex2] A: 0.21473 | B: 0.44576 | C: 0.33553
** [JOINT LOSS] ** : 0.990844
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010248 | Grad Max: 0.253007
  -> Layer: shared_layers.0.bias | Grad Mean: 0.840399 | Grad Max: 3.459207
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.006545
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002185 | Grad Max: 0.002185
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005239 | Grad Max: 0.530590
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.098656 | Grad Max: 3.000314
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000990 | Grad Max: 0.031657
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048980 | Grad Max: 0.238478
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000127 | Grad Max: 0.001460
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010787 | Grad Max: 0.022586
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000669
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003206 | Grad Max: 0.007412
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.004189 | Grad Max: 0.007656
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.077485 | Grad Max: 0.077485
[GRADIENT NORM TOTAL] 16.3654

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.430
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5764949  0.42350516] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 509/1107 | B: 522/1526 | C: 289/1759
[LOSS Ex1] A: 0.66005 | B: 0.65601 | C: 0.65127
[LOGITS Ex2 A] Mean Abs: 1.800 | Max: 6.356
[LOSS Ex2] A: 0.19830 | B: 0.41623 | C: 0.30846
** [JOINT LOSS] ** : 0.963443
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007875 | Grad Max: 0.239320
  -> Layer: shared_layers.0.bias | Grad Mean: 0.725619 | Grad Max: 3.105739
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.007230
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006899 | Grad Max: 0.006899
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004418 | Grad Max: 0.464533
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083639 | Grad Max: 2.583216
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000848 | Grad Max: 0.030269
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042083 | Grad Max: 0.215587
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001218
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009246 | Grad Max: 0.018792
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000579
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002738 | Grad Max: 0.006525
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003415 | Grad Max: 0.005775
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.065069 | Grad Max: 0.065069
[GRADIENT NORM TOTAL] 13.9772

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.558
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061977 0.4938023] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 644/1404 | B: 489/1367 | C: 266/1782
[LOSS Ex1] A: 0.66028 | B: 0.65933 | C: 0.65509
[LOGITS Ex2 A] Mean Abs: 1.745 | Max: 6.127
[LOSS Ex2] A: 0.18292 | B: 0.37559 | C: 0.30535
** [JOINT LOSS] ** : 0.946187
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003820 | Grad Max: 0.123193
  -> Layer: shared_layers.0.bias | Grad Mean: 0.340883 | Grad Max: 1.366789
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.007477
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010207 | Grad Max: 0.010207
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.213513
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038901 | Grad Max: 1.209005
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000412 | Grad Max: 0.014977
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020233 | Grad Max: 0.110579
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000617
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004379 | Grad Max: 0.008962
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000274
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001312 | Grad Max: 0.003087
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001654 | Grad Max: 0.003496
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031776 | Grad Max: 0.031776
[GRADIENT NORM TOTAL] 6.4645

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.507
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5088226 0.4911774] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 621/1427 | B: 507/1541 | C: 279/1769
[LOSS Ex1] A: 0.65849 | B: 0.65862 | C: 0.65335
[LOGITS Ex2 A] Mean Abs: 1.682 | Max: 5.613
[LOSS Ex2] A: 0.18409 | B: 0.41492 | C: 0.31056
** [JOINT LOSS] ** : 0.960010
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005482 | Grad Max: 0.148315
  -> Layer: shared_layers.0.bias | Grad Mean: 0.412188 | Grad Max: 1.834075
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002321 | Grad Max: 0.008382
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011319 | Grad Max: 0.011319
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002662 | Grad Max: 0.417309
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049662 | Grad Max: 2.345150
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000471 | Grad Max: 0.015661
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023155 | Grad Max: 0.108312
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000651
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005162 | Grad Max: 0.010679
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000332
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001536 | Grad Max: 0.003785
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001915 | Grad Max: 0.003634
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036007 | Grad Max: 0.036007
[GRADIENT NORM TOTAL] 8.5870

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.522
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015819  0.49841812] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 630/1418 | B: 522/1526 | C: 266/1782
[LOSS Ex1] A: 0.65751 | B: 0.65929 | C: 0.65559
[LOGITS Ex2 A] Mean Abs: 1.635 | Max: 7.033
[LOSS Ex2] A: 0.19972 | B: 0.43016 | C: 0.31834
** [JOINT LOSS] ** : 0.973532
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010211 | Grad Max: 0.246863
  -> Layer: shared_layers.0.bias | Grad Mean: 0.741150 | Grad Max: 3.030818
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.007673
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002843 | Grad Max: 0.002843
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004761 | Grad Max: 0.552221
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.090088 | Grad Max: 3.059556
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000891 | Grad Max: 0.025051
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044063 | Grad Max: 0.189829
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000116 | Grad Max: 0.001342
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009768 | Grad Max: 0.020651
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000623
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002905 | Grad Max: 0.006954
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003720 | Grad Max: 0.007017
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.070417 | Grad Max: 0.070417
[GRADIENT NORM TOTAL] 14.6455

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.437
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50400954 0.49599043] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.043
[MASKS] A(Pass/Fail): 602/1446 | B: 523/1525 | C: 261/1787
[LOSS Ex1] A: 0.66337 | B: 0.65590 | C: 0.65756
[LOGITS Ex2 A] Mean Abs: 1.619 | Max: 6.258
[LOSS Ex2] A: 0.19459 | B: 0.41948 | C: 0.31282
** [JOINT LOSS] ** : 0.967908
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009789 | Grad Max: 0.264185
  -> Layer: shared_layers.0.bias | Grad Mean: 0.669376 | Grad Max: 2.738951
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.006058
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000509 | Grad Max: 0.000509
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004382 | Grad Max: 0.530105
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.082452 | Grad Max: 2.954613
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000808 | Grad Max: 0.024109
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039877 | Grad Max: 0.182125
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001231
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008874 | Grad Max: 0.018222
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000569
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002629 | Grad Max: 0.006257
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003343 | Grad Max: 0.006017
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062988 | Grad Max: 0.062988
[GRADIENT NORM TOTAL] 13.1925

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.364
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53175616 0.46824378] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 600/1448 | B: 489/1367 | C: 279/1769
[LOSS Ex1] A: 0.66312 | B: 0.65922 | C: 0.65554
[LOGITS Ex2 A] Mean Abs: 1.627 | Max: 5.553
[LOSS Ex2] A: 0.18806 | B: 0.37940 | C: 0.29645
** [JOINT LOSS] ** : 0.947268
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005722 | Grad Max: 0.171567
  -> Layer: shared_layers.0.bias | Grad Mean: 0.386305 | Grad Max: 1.575985
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.006753
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008523 | Grad Max: 0.008523
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002507 | Grad Max: 0.312456
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046596 | Grad Max: 1.770070
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.014699
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023147 | Grad Max: 0.110083
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000735
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005134 | Grad Max: 0.011671
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000337
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001511 | Grad Max: 0.003678
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001826 | Grad Max: 0.003784
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034802 | Grad Max: 0.034802
[GRADIENT NORM TOTAL] 7.6180

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.499
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64783424 0.35216576] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.044
[MASKS] A(Pass/Fail): 669/1379 | B: 507/1541 | C: 272/1776
[LOSS Ex1] A: 0.65912 | B: 0.65852 | C: 0.65373
[LOGITS Ex2 A] Mean Abs: 1.734 | Max: 6.611
[LOSS Ex2] A: 0.17755 | B: 0.39490 | C: 0.29574
** [JOINT LOSS] ** : 0.946518
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004812 | Grad Max: 0.122245
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239727 | Grad Max: 1.148506
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.007506
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008350 | Grad Max: 0.008350
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001579 | Grad Max: 0.153589
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029319 | Grad Max: 0.820261
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.009712
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014165 | Grad Max: 0.073763
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000528
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003177 | Grad Max: 0.007571
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000226
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000946 | Grad Max: 0.002157
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001200 | Grad Max: 0.003076
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023199 | Grad Max: 0.023199
[GRADIENT NORM TOTAL] 4.6084

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.560
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500487 0.499513] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 644/1404 | B: 522/1526 | C: 179/1197
[LOSS Ex1] A: 0.66379 | B: 0.65919 | C: 0.65457
[LOGITS Ex2 A] Mean Abs: 1.741 | Max: 5.586
[LOSS Ex2] A: 0.18591 | B: 0.40150 | C: 0.33568
** [JOINT LOSS] ** : 0.966876
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007214 | Grad Max: 0.170176
  -> Layer: shared_layers.0.bias | Grad Mean: 0.487438 | Grad Max: 2.268451
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.006209
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002696 | Grad Max: 0.002696
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003078 | Grad Max: 0.293656
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058063 | Grad Max: 1.604393
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000568 | Grad Max: 0.019736
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028085 | Grad Max: 0.145123
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000898
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006201 | Grad Max: 0.013845
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000416
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001844 | Grad Max: 0.004397
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002367 | Grad Max: 0.004209
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044261 | Grad Max: 0.044261
[GRADIENT NORM TOTAL] 9.4990

[EPOCH SUMMARY] Train Loss: 0.9629

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9368 | Alpha: 0.5500
No improve count: 4/15

############################## EPOCH 78/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.356
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62171084 0.37828916] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 623/1425 | B: 524/1524 | C: 256/1792
[LOSS Ex1] A: 0.66110 | B: 0.65580 | C: 0.65545
[LOGITS Ex2 A] Mean Abs: 1.723 | Max: 5.613
[LOSS Ex2] A: 0.19033 | B: 0.38589 | C: 0.29578
** [JOINT LOSS] ** : 0.948115
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006113 | Grad Max: 0.143883
  -> Layer: shared_layers.0.bias | Grad Mean: 0.389257 | Grad Max: 1.704046
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.007009
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006667 | Grad Max: 0.006667
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002479 | Grad Max: 0.258095
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046315 | Grad Max: 1.391597
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000450 | Grad Max: 0.015012
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022176 | Grad Max: 0.110390
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000696
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004930 | Grad Max: 0.010262
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000291
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001474 | Grad Max: 0.003348
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001876 | Grad Max: 0.004056
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035714 | Grad Max: 0.035714
[GRADIENT NORM TOTAL] 7.5601

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.432
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57700163 0.42299837] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 510/1106 | B: 490/1366 | C: 266/1782
[LOSS Ex1] A: 0.65986 | B: 0.65913 | C: 0.65420
[LOGITS Ex2 A] Mean Abs: 1.722 | Max: 5.876
[LOSS Ex2] A: 0.17384 | B: 0.37484 | C: 0.29398
** [JOINT LOSS] ** : 0.938614
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001672 | Grad Max: 0.024073
  -> Layer: shared_layers.0.bias | Grad Mean: 0.079891 | Grad Max: 0.312055
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.006905
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001533 | Grad Max: 0.001533
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000673 | Grad Max: 0.105138
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011952 | Grad Max: 0.583209
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004171
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004845 | Grad Max: 0.029090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000292
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001077 | Grad Max: 0.003361
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000318 | Grad Max: 0.000912
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000529 | Grad Max: 0.001728
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007362 | Grad Max: 0.007362
[GRADIENT NORM TOTAL] 1.9281

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.561
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.506181   0.49381894] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 646/1402 | B: 508/1540 | C: 305/1743
[LOSS Ex1] A: 0.66010 | B: 0.65842 | C: 0.65230
[LOGITS Ex2 A] Mean Abs: 1.694 | Max: 6.213
[LOSS Ex2] A: 0.17284 | B: 0.40449 | C: 0.30650
** [JOINT LOSS] ** : 0.951545
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002521 | Grad Max: 0.073056
  -> Layer: shared_layers.0.bias | Grad Mean: 0.151931 | Grad Max: 0.687813
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.006952
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002913 | Grad Max: 0.002913
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000990 | Grad Max: 0.131348
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017977 | Grad Max: 0.741198
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000188 | Grad Max: 0.006936
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009293 | Grad Max: 0.050312
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000384
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002080 | Grad Max: 0.005514
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000159
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000628 | Grad Max: 0.001661
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000923 | Grad Max: 0.002564
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015836 | Grad Max: 0.015836
[GRADIENT NORM TOTAL] 2.9648

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.509
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5089145 0.4910855] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 628/1420 | B: 522/1526 | C: 277/1771
[LOSS Ex1] A: 0.65829 | B: 0.65908 | C: 0.65377
[LOGITS Ex2 A] Mean Abs: 1.710 | Max: 5.427
[LOSS Ex2] A: 0.17717 | B: 0.39666 | C: 0.29575
** [JOINT LOSS] ** : 0.946906
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003172 | Grad Max: 0.091576
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094382 | Grad Max: 0.384567
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.007683
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005887 | Grad Max: 0.005887
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000718 | Grad Max: 0.132072
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012805 | Grad Max: 0.704798
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.003787
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005119 | Grad Max: 0.022268
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000261
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001155 | Grad Max: 0.003461
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000334 | Grad Max: 0.001010
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000452 | Grad Max: 0.001606
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007132 | Grad Max: 0.007132
[GRADIENT NORM TOTAL] 2.1391

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.525
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50156283 0.49843723] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 634/1414 | B: 524/1524 | C: 272/1776
[LOSS Ex1] A: 0.65730 | B: 0.65567 | C: 0.65338
[LOGITS Ex2 A] Mean Abs: 1.701 | Max: 6.728
[LOSS Ex2] A: 0.19324 | B: 0.38520 | C: 0.29293
** [JOINT LOSS] ** : 0.945906
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.063630
  -> Layer: shared_layers.0.bias | Grad Mean: 0.049847 | Grad Max: 0.246486
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.007297
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001312 | Grad Max: 0.001312
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000522 | Grad Max: 0.100372
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008438 | Grad Max: 0.530700
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.002433
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002394 | Grad Max: 0.017983
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000160
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000552 | Grad Max: 0.002223
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000175 | Grad Max: 0.000713
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000446 | Grad Max: 0.001348
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004291 | Grad Max: 0.004291
[GRADIENT NORM TOTAL] 1.4414

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.440
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503967 0.496033] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.535 | Std: 0.043
[MASKS] A(Pass/Fail): 605/1443 | B: 491/1365 | C: 278/1770
[LOSS Ex1] A: 0.66317 | B: 0.65899 | C: 0.65436
[LOGITS Ex2 A] Mean Abs: 1.653 | Max: 6.154
[LOSS Ex2] A: 0.17732 | B: 0.37634 | C: 0.30652
** [JOINT LOSS] ** : 0.945565
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004364 | Grad Max: 0.157970
  -> Layer: shared_layers.0.bias | Grad Mean: 0.223747 | Grad Max: 0.855146
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.006169
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000255 | Grad Max: 0.000255
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001606 | Grad Max: 0.224761
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029733 | Grad Max: 1.264964
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.009864
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013412 | Grad Max: 0.064804
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000464
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003015 | Grad Max: 0.006691
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000882 | Grad Max: 0.002272
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001123 | Grad Max: 0.002665
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020368 | Grad Max: 0.020368
[GRADIENT NORM TOTAL] 4.6059

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.368
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.532057   0.46794304] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 601/1447 | B: 510/1538 | C: 253/1795
[LOSS Ex1] A: 0.66291 | B: 0.65826 | C: 0.65439
[LOGITS Ex2 A] Mean Abs: 1.658 | Max: 6.412
[LOSS Ex2] A: 0.18077 | B: 0.39455 | C: 0.29404
** [JOINT LOSS] ** : 0.948305
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004311 | Grad Max: 0.156626
  -> Layer: shared_layers.0.bias | Grad Mean: 0.106630 | Grad Max: 0.433461
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.006174
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006361 | Grad Max: 0.006361
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000863 | Grad Max: 0.215841
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014964 | Grad Max: 1.212016
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000131 | Grad Max: 0.004129
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006050 | Grad Max: 0.024584
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000264
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001424 | Grad Max: 0.003873
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000138
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000427 | Grad Max: 0.001254
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000638 | Grad Max: 0.001867
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010135 | Grad Max: 0.010135
[GRADIENT NORM TOTAL] 2.5360

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.502
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64932144 0.3506786 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.044
[MASKS] A(Pass/Fail): 670/1378 | B: 524/1524 | C: 273/1775
[LOSS Ex1] A: 0.65885 | B: 0.65891 | C: 0.65416
[LOGITS Ex2 A] Mean Abs: 1.722 | Max: 6.391
[LOSS Ex2] A: 0.16983 | B: 0.39835 | C: 0.32468
** [JOINT LOSS] ** : 0.954930
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003122 | Grad Max: 0.122284
  -> Layer: shared_layers.0.bias | Grad Mean: 0.336169 | Grad Max: 1.583581
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.007469
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008245 | Grad Max: 0.008245
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.225895
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039828 | Grad Max: 1.273723
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.012061
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019730 | Grad Max: 0.086214
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000635
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004306 | Grad Max: 0.009279
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000258
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001260 | Grad Max: 0.002893
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001536 | Grad Max: 0.003230
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029419 | Grad Max: 0.029419
[GRADIENT NORM TOTAL] 6.9280

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.564
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005332  0.49946684] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.043
[MASKS] A(Pass/Fail): 647/1401 | B: 526/1522 | C: 269/1779
[LOSS Ex1] A: 0.66354 | B: 0.65549 | C: 0.65592
[LOGITS Ex2 A] Mean Abs: 1.740 | Max: 6.005
[LOSS Ex2] A: 0.17150 | B: 0.38376 | C: 0.30887
** [JOINT LOSS] ** : 0.946362
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003748 | Grad Max: 0.138263
  -> Layer: shared_layers.0.bias | Grad Mean: 0.379119 | Grad Max: 1.830999
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.006139
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000790 | Grad Max: 0.000790
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002391 | Grad Max: 0.236937
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044721 | Grad Max: 1.310992
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000443 | Grad Max: 0.014002
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022033 | Grad Max: 0.098024
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000712
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004801 | Grad Max: 0.010216
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000305
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001409 | Grad Max: 0.003414
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001746 | Grad Max: 0.003781
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033359 | Grad Max: 0.033359
[GRADIENT NORM TOTAL] 7.6388

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.360
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6230212  0.37697875] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 626/1422 | B: 493/1363 | C: 270/1778
[LOSS Ex1] A: 0.66081 | B: 0.65882 | C: 0.65502
[LOGITS Ex2 A] Mean Abs: 1.720 | Max: 6.702
[LOSS Ex2] A: 0.18829 | B: 0.37044 | C: 0.31236
** [JOINT LOSS] ** : 0.948583
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001938 | Grad Max: 0.046554
  -> Layer: shared_layers.0.bias | Grad Mean: 0.063250 | Grad Max: 0.306866
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.006567
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004539 | Grad Max: 0.004539
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000544 | Grad Max: 0.125344
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009021 | Grad Max: 0.700514
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003183
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001804 | Grad Max: 0.016240
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000174
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000327 | Grad Max: 0.001761
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000078
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000663
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000384 | Grad Max: 0.001145
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002938 | Grad Max: 0.002938
[GRADIENT NORM TOTAL] 1.7795

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.436
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5779116  0.42208838] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.044
[MASKS] A(Pass/Fail): 513/1103 | B: 512/1536 | C: 285/1763
[LOSS Ex1] A: 0.65954 | B: 0.65809 | C: 0.65202
[LOGITS Ex2 A] Mean Abs: 1.726 | Max: 5.374
[LOSS Ex2] A: 0.17224 | B: 0.40595 | C: 0.29433
** [JOINT LOSS] ** : 0.947390
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004911 | Grad Max: 0.115427
  -> Layer: shared_layers.0.bias | Grad Mean: 0.328948 | Grad Max: 1.366957
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.006829
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001876 | Grad Max: 0.001876
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.283182
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039069 | Grad Max: 1.588446
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000369 | Grad Max: 0.011198
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018242 | Grad Max: 0.084266
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000627
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004041 | Grad Max: 0.008989
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000256
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001199 | Grad Max: 0.002748
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001650 | Grad Max: 0.003315
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029446 | Grad Max: 0.029446
[GRADIENT NORM TOTAL] 6.7808

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.566
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.506221   0.49377906] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.044
[MASKS] A(Pass/Fail): 648/1400 | B: 526/1522 | C: 302/1746
[LOSS Ex1] A: 0.65977 | B: 0.65875 | C: 0.65000
[LOGITS Ex2 A] Mean Abs: 1.711 | Max: 5.904
[LOSS Ex2] A: 0.17493 | B: 0.40569 | C: 0.28381
** [JOINT LOSS] ** : 0.944319
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003738 | Grad Max: 0.107973
  -> Layer: shared_layers.0.bias | Grad Mean: 0.306941 | Grad Max: 1.478740
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.007533
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007567 | Grad Max: 0.007567
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.263368
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036198 | Grad Max: 1.476573
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.012408
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016639 | Grad Max: 0.089099
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000536
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003601 | Grad Max: 0.008065
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001049 | Grad Max: 0.002650
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.002913
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024802 | Grad Max: 0.024802
[GRADIENT NORM TOTAL] 6.3175

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.515
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090646  0.49093536] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 630/1418 | B: 529/1519 | C: 263/1785
[LOSS Ex1] A: 0.65792 | B: 0.65532 | C: 0.65518
[LOGITS Ex2 A] Mean Abs: 1.736 | Max: 5.602
[LOSS Ex2] A: 0.17778 | B: 0.38006 | C: 0.30747
** [JOINT LOSS] ** : 0.944578
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004251 | Grad Max: 0.167864
  -> Layer: shared_layers.0.bias | Grad Mean: 0.064492 | Grad Max: 0.310871
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.007522
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003842 | Grad Max: 0.003842
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000689 | Grad Max: 0.113062
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011099 | Grad Max: 0.594660
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003280
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002693 | Grad Max: 0.019047
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000206
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000639 | Grad Max: 0.002233
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000186 | Grad Max: 0.000680
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000355 | Grad Max: 0.001390
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004621 | Grad Max: 0.004621
[GRADIENT NORM TOTAL] 1.8802

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.531
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50161123 0.49838874] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 634/1414 | B: 494/1362 | C: 180/1196
[LOSS Ex1] A: 0.65691 | B: 0.65866 | C: 0.65502
[LOGITS Ex2 A] Mean Abs: 1.750 | Max: 8.022
[LOSS Ex2] A: 0.20003 | B: 0.36335 | C: 0.31305
** [JOINT LOSS] ** : 0.949003
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004577 | Grad Max: 0.185731
  -> Layer: shared_layers.0.bias | Grad Mean: 0.114685 | Grad Max: 0.377449
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.007257
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000673 | Grad Max: 0.000673
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001006 | Grad Max: 0.122711
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017368 | Grad Max: 0.690644
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000151 | Grad Max: 0.005547
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007055 | Grad Max: 0.042188
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000331
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001682 | Grad Max: 0.004613
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000158
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000502 | Grad Max: 0.001485
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000660 | Grad Max: 0.002127
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011987 | Grad Max: 0.011987
[GRADIENT NORM TOTAL] 2.6601

[EPOCH SUMMARY] Train Loss: 0.9472

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9263 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9346 -> New: 0.9263)

############################## EPOCH 79/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.445
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040043  0.49599573] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 607/1441 | B: 513/1535 | C: 295/1753
[LOSS Ex1] A: 0.66286 | B: 0.65792 | C: 0.65190
[LOGITS Ex2 A] Mean Abs: 1.708 | Max: 5.648
[LOSS Ex2] A: 0.17109 | B: 0.39931 | C: 0.29661
** [JOINT LOSS] ** : 0.946563
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.103064
  -> Layer: shared_layers.0.bias | Grad Mean: 0.077792 | Grad Max: 0.413695
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.006060
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002294 | Grad Max: 0.002294
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000704 | Grad Max: 0.081625
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012203 | Grad Max: 0.453901
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.004265
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005419 | Grad Max: 0.026384
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000301
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001281 | Grad Max: 0.003528
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000112
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000391 | Grad Max: 0.001143
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000617 | Grad Max: 0.001752
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010257 | Grad Max: 0.010257
[GRADIENT NORM TOTAL] 1.8384

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.373
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5324548  0.46754518] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.535 | Std: 0.042
[MASKS] A(Pass/Fail): 603/1445 | B: 528/1520 | C: 266/1782
[LOSS Ex1] A: 0.66258 | B: 0.65857 | C: 0.65282
[LOGITS Ex2 A] Mean Abs: 1.685 | Max: 5.575
[LOSS Ex2] A: 0.18429 | B: 0.39141 | C: 0.28748
** [JOINT LOSS] ** : 0.945721
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003626 | Grad Max: 0.147380
  -> Layer: shared_layers.0.bias | Grad Mean: 0.073593 | Grad Max: 0.311938
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.006505
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003817 | Grad Max: 0.003817
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000695 | Grad Max: 0.140898
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010906 | Grad Max: 0.802423
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.004125
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001874 | Grad Max: 0.025112
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000185
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000252 | Grad Max: 0.001738
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000067 | Grad Max: 0.000355
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001197
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000384 | Grad Max: 0.000384
[GRADIENT NORM TOTAL] 2.0170

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.507
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.65120375 0.34879625] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.045
[MASKS] A(Pass/Fail): 673/1375 | B: 530/1518 | C: 272/1776
[LOSS Ex1] A: 0.65847 | B: 0.65512 | C: 0.65174
[LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.255
[LOSS Ex2] A: 0.17917 | B: 0.37153 | C: 0.29102
** [JOINT LOSS] ** : 0.935683
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002653 | Grad Max: 0.077887
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191369 | Grad Max: 0.977474
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.007265
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005062 | Grad Max: 0.005062
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001285 | Grad Max: 0.171834
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022967 | Grad Max: 0.974400
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.008487
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009798 | Grad Max: 0.066481
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000324
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002035 | Grad Max: 0.004794
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000133
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000599 | Grad Max: 0.001487
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000753 | Grad Max: 0.002329
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015087 | Grad Max: 0.015087
[GRADIENT NORM TOTAL] 4.1031

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.570
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005087  0.49949127] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 648/1400 | B: 494/1362 | C: 279/1769
[LOSS Ex1] A: 0.66321 | B: 0.65846 | C: 0.65420
[LOGITS Ex2 A] Mean Abs: 1.736 | Max: 5.979
[LOSS Ex2] A: 0.17106 | B: 0.37261 | C: 0.29870
** [JOINT LOSS] ** : 0.939409
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003716 | Grad Max: 0.127788
  -> Layer: shared_layers.0.bias | Grad Mean: 0.060660 | Grad Max: 0.276597
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.006691
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004984 | Grad Max: 0.004984
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000648 | Grad Max: 0.149375
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010656 | Grad Max: 0.822123
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.003011
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003539 | Grad Max: 0.017238
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000176
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000904 | Grad Max: 0.002841
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000270 | Grad Max: 0.001147
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000491 | Grad Max: 0.001879
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006570 | Grad Max: 0.006570
[GRADIENT NORM TOTAL] 1.7765

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.365
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62449944 0.37550056] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 629/1419 | B: 514/1534 | C: 253/1795
[LOSS Ex1] A: 0.66043 | B: 0.65771 | C: 0.65654
[LOGITS Ex2 A] Mean Abs: 1.733 | Max: 5.716
[LOSS Ex2] A: 0.18506 | B: 0.39926 | C: 0.30108
** [JOINT LOSS] ** : 0.953363
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.045623
  -> Layer: shared_layers.0.bias | Grad Mean: 0.057481 | Grad Max: 0.290084
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.007100
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006154 | Grad Max: 0.006154
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000484 | Grad Max: 0.156067
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008221 | Grad Max: 0.878404
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003204
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002469 | Grad Max: 0.017738
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000153
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000542 | Grad Max: 0.002370
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000087
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000168 | Grad Max: 0.000685
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000279 | Grad Max: 0.001220
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004460 | Grad Max: 0.004460
[GRADIENT NORM TOTAL] 1.7477

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.441
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57888484 0.42111516] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.045
[MASKS] A(Pass/Fail): 515/1101 | B: 529/1519 | C: 303/1745
[LOSS Ex1] A: 0.65914 | B: 0.65836 | C: 0.65112
[LOGITS Ex2 A] Mean Abs: 1.753 | Max: 5.858
[LOSS Ex2] A: 0.17276 | B: 0.39956 | C: 0.30142
** [JOINT LOSS] ** : 0.947457
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002764 | Grad Max: 0.073098
  -> Layer: shared_layers.0.bias | Grad Mean: 0.166055 | Grad Max: 0.879594
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006944
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005528 | Grad Max: 0.005528
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001111 | Grad Max: 0.182210
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019750 | Grad Max: 1.014890
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.007665
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008454 | Grad Max: 0.055542
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000296
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001744 | Grad Max: 0.004194
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000142
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000512 | Grad Max: 0.001384
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000713 | Grad Max: 0.001892
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013078 | Grad Max: 0.013078
[GRADIENT NORM TOTAL] 3.5383

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.572
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063304 0.4936696] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.045
[MASKS] A(Pass/Fail): 648/1400 | B: 531/1517 | C: 266/1782
[LOSS Ex1] A: 0.65938 | B: 0.65489 | C: 0.65562
[LOGITS Ex2 A] Mean Abs: 1.754 | Max: 6.901
[LOSS Ex2] A: 0.17753 | B: 0.36632 | C: 0.30605
** [JOINT LOSS] ** : 0.939928
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005196 | Grad Max: 0.226777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.062685 | Grad Max: 0.273639
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006561
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000360 | Grad Max: 0.000360
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000707 | Grad Max: 0.083520
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010466 | Grad Max: 0.465641
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.003081
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002571 | Grad Max: 0.014888
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000676 | Grad Max: 0.002419
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000224 | Grad Max: 0.000761
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000490 | Grad Max: 0.001562
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005856 | Grad Max: 0.005856
[GRADIENT NORM TOTAL] 1.7188

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.521
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50913763 0.49086234] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.045
[MASKS] A(Pass/Fail): 633/1415 | B: 494/1362 | C: 266/1782
[LOSS Ex1] A: 0.65749 | B: 0.65823 | C: 0.65214
[LOGITS Ex2 A] Mean Abs: 1.774 | Max: 6.798
[LOSS Ex2] A: 0.17036 | B: 0.36847 | C: 0.29542
** [JOINT LOSS] ** : 0.934036
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003736 | Grad Max: 0.148007
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100224 | Grad Max: 0.501254
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.007501
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006337 | Grad Max: 0.006337
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000729 | Grad Max: 0.100359
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012936 | Grad Max: 0.552802
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.004419
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005564 | Grad Max: 0.032200
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000222
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001261 | Grad Max: 0.003261
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000112
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000374 | Grad Max: 0.000982
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000502 | Grad Max: 0.001809
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008883 | Grad Max: 0.008883
[GRADIENT NORM TOTAL] 2.1737

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.538
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50171363 0.4982864 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.045
[MASKS] A(Pass/Fail): 636/1412 | B: 515/1533 | C: 281/1767
[LOSS Ex1] A: 0.65646 | B: 0.65747 | C: 0.65300
[LOGITS Ex2 A] Mean Abs: 1.743 | Max: 6.800
[LOSS Ex2] A: 0.17852 | B: 0.39634 | C: 0.30336
** [JOINT LOSS] ** : 0.948383
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003174 | Grad Max: 0.075616
  -> Layer: shared_layers.0.bias | Grad Mean: 0.224632 | Grad Max: 0.822249
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.007406
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002874 | Grad Max: 0.002874
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001373 | Grad Max: 0.237133
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025856 | Grad Max: 1.320731
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.009497
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012290 | Grad Max: 0.064077
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000410
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002643 | Grad Max: 0.006447
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000207
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000770 | Grad Max: 0.001865
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001088 | Grad Max: 0.002171
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019009 | Grad Max: 0.019009
[GRADIENT NORM TOTAL] 4.4213

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.450
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040556  0.49594438] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.536 | Std: 0.044
[MASKS] A(Pass/Fail): 608/1440 | B: 530/1518 | C: 258/1790
[LOSS Ex1] A: 0.66247 | B: 0.65811 | C: 0.65574
[LOGITS Ex2 A] Mean Abs: 1.742 | Max: 5.857
[LOSS Ex2] A: 0.17921 | B: 0.40000 | C: 0.30683
** [JOINT LOSS] ** : 0.954122
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005287 | Grad Max: 0.200987
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176547 | Grad Max: 0.657440
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.006185
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000866 | Grad Max: 0.000866
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001256 | Grad Max: 0.189459
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022745 | Grad Max: 1.036427
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000218 | Grad Max: 0.005712
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010452 | Grad Max: 0.040634
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000418
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002376 | Grad Max: 0.005451
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000705 | Grad Max: 0.001772
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001093 | Grad Max: 0.002179
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018207 | Grad Max: 0.018207
[GRADIENT NORM TOTAL] 3.4268

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.378
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5328027  0.46719727] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.043
[MASKS] A(Pass/Fail): 604/1444 | B: 531/1517 | C: 285/1763
[LOSS Ex1] A: 0.66220 | B: 0.65463 | C: 0.65179
[LOGITS Ex2 A] Mean Abs: 1.750 | Max: 6.529
[LOSS Ex2] A: 0.18810 | B: 0.37590 | C: 0.30375
** [JOINT LOSS] ** : 0.945456
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004526 | Grad Max: 0.157552
  -> Layer: shared_layers.0.bias | Grad Mean: 0.325635 | Grad Max: 1.761979
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006761
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001498 | Grad Max: 0.001498
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.254898
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038141 | Grad Max: 1.436574
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000344 | Grad Max: 0.013436
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017069 | Grad Max: 0.097664
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000525
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003533 | Grad Max: 0.007712
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000232
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001014 | Grad Max: 0.002674
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001152 | Grad Max: 0.002696
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023235 | Grad Max: 0.023235
[GRADIENT NORM TOTAL] 6.8218

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.512
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6533809  0.34661916] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.045
[MASKS] A(Pass/Fail): 674/1374 | B: 494/1362 | C: 298/1750
[LOSS Ex1] A: 0.65804 | B: 0.65799 | C: 0.64955
[LOGITS Ex2 A] Mean Abs: 1.811 | Max: 6.428
[LOSS Ex2] A: 0.16804 | B: 0.38407 | C: 0.29628
** [JOINT LOSS] ** : 0.937991
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004193 | Grad Max: 0.132219
  -> Layer: shared_layers.0.bias | Grad Mean: 0.287439 | Grad Max: 1.408444
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.007067
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003039 | Grad Max: 0.003039
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001964 | Grad Max: 0.246729
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035009 | Grad Max: 1.390565
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000313 | Grad Max: 0.013052
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015431 | Grad Max: 0.090907
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000482
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003201 | Grad Max: 0.007064
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000195
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000919 | Grad Max: 0.002115
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001135 | Grad Max: 0.002449
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021949 | Grad Max: 0.021949
[GRADIENT NORM TOTAL] 6.1488

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.576
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50051105 0.49948892] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 648/1400 | B: 517/1531 | C: 281/1767
[LOSS Ex1] A: 0.66283 | B: 0.65723 | C: 0.65252
[LOGITS Ex2 A] Mean Abs: 1.793 | Max: 6.365
[LOSS Ex2] A: 0.17454 | B: 0.38762 | C: 0.30392
** [JOINT LOSS] ** : 0.946222
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004869 | Grad Max: 0.225349
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127665 | Grad Max: 0.488586
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.006404
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003277 | Grad Max: 0.003277
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001056 | Grad Max: 0.104239
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016841 | Grad Max: 0.571942
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.006206
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005011 | Grad Max: 0.040804
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000817 | Grad Max: 0.003009
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000077
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000220 | Grad Max: 0.000675
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000300 | Grad Max: 0.001171
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004760 | Grad Max: 0.004760
[GRADIENT NORM TOTAL] 2.6807

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.370
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62610507 0.3738949 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.045
[MASKS] A(Pass/Fail): 630/1418 | B: 532/1516 | C: 162/1214
[LOSS Ex1] A: 0.66003 | B: 0.65789 | C: 0.65773
[LOGITS Ex2 A] Mean Abs: 1.756 | Max: 6.244
[LOSS Ex2] A: 0.18525 | B: 0.40048 | C: 0.30867
** [JOINT LOSS] ** : 0.956686
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007315 | Grad Max: 0.200693
  -> Layer: shared_layers.0.bias | Grad Mean: 0.361703 | Grad Max: 1.535373
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.006608
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008955 | Grad Max: 0.008955
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002431 | Grad Max: 0.255090
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044959 | Grad Max: 1.412108
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.012128
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021788 | Grad Max: 0.089359
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000618
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004883 | Grad Max: 0.009995
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000312
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001427 | Grad Max: 0.003308
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001900 | Grad Max: 0.003408
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034456 | Grad Max: 0.034456
[GRADIENT NORM TOTAL] 7.0633

[EPOCH SUMMARY] Train Loss: 0.9451

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9278 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 80/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.446
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.57988304 0.420117  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.045
[MASKS] A(Pass/Fail): 520/1096 | B: 532/1516 | C: 278/1770
[LOSS Ex1] A: 0.65874 | B: 0.65441 | C: 0.65175
[LOGITS Ex2 A] Mean Abs: 1.792 | Max: 5.665
[LOSS Ex2] A: 0.15899 | B: 0.38081 | C: 0.29643
** [JOINT LOSS] ** : 0.933709
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003812 | Grad Max: 0.106518
  -> Layer: shared_layers.0.bias | Grad Mean: 0.318918 | Grad Max: 1.383822
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.006912
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005369 | Grad Max: 0.005369
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.230262
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037871 | Grad Max: 1.303062
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000372 | Grad Max: 0.012121
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018615 | Grad Max: 0.093161
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000585
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004030 | Grad Max: 0.009030
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001161 | Grad Max: 0.002698
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001553 | Grad Max: 0.003124
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027830 | Grad Max: 0.027830
[GRADIENT NORM TOTAL] 6.4406

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.577
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.506386 0.493614] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 650/1398 | B: 494/1362 | C: 278/1770
[LOSS Ex1] A: 0.65899 | B: 0.65779 | C: 0.65186
[LOGITS Ex2 A] Mean Abs: 1.787 | Max: 7.330
[LOSS Ex2] A: 0.18021 | B: 0.37023 | C: 0.28497
** [JOINT LOSS] ** : 0.934685
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005131 | Grad Max: 0.251746
  -> Layer: shared_layers.0.bias | Grad Mean: 0.074662 | Grad Max: 0.313132
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.006887
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004332 | Grad Max: 0.004332
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000830 | Grad Max: 0.111386
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012817 | Grad Max: 0.591660
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.004387
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002744 | Grad Max: 0.023988
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000215
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000714 | Grad Max: 0.002621
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000233 | Grad Max: 0.000769
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000425 | Grad Max: 0.001531
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006278 | Grad Max: 0.006278
[GRADIENT NORM TOTAL] 2.1594

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.527
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5092505  0.49074948] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.045
[MASKS] A(Pass/Fail): 635/1413 | B: 518/1530 | C: 277/1771
[LOSS Ex1] A: 0.65708 | B: 0.65703 | C: 0.65397
[LOGITS Ex2 A] Mean Abs: 1.808 | Max: 6.445
[LOSS Ex2] A: 0.18217 | B: 0.39684 | C: 0.29378
** [JOINT LOSS] ** : 0.946956
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007007 | Grad Max: 0.234749
  -> Layer: shared_layers.0.bias | Grad Mean: 0.252796 | Grad Max: 0.845870
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002373 | Grad Max: 0.008229
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015435 | Grad Max: 0.015435
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001791 | Grad Max: 0.168842
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032844 | Grad Max: 0.918997
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000322 | Grad Max: 0.009553
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015619 | Grad Max: 0.066981
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000556
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003471 | Grad Max: 0.008061
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001001 | Grad Max: 0.002398
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.002717
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024191 | Grad Max: 0.024191
[GRADIENT NORM TOTAL] 4.8399

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.544
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017686 0.4982314] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.045
[MASKS] A(Pass/Fail): 638/1410 | B: 532/1516 | C: 252/1796
[LOSS Ex1] A: 0.65607 | B: 0.65769 | C: 0.65579
[LOGITS Ex2 A] Mean Abs: 1.785 | Max: 7.376
[LOSS Ex2] A: 0.18944 | B: 0.39178 | C: 0.30104
** [JOINT LOSS] ** : 0.950600
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003959 | Grad Max: 0.136602
  -> Layer: shared_layers.0.bias | Grad Mean: 0.054338 | Grad Max: 0.253010
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.007230
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002236 | Grad Max: 0.002236
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000610 | Grad Max: 0.070444
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009009 | Grad Max: 0.364294
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003610
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001635 | Grad Max: 0.019051
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000174
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000274 | Grad Max: 0.002101
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000073 | Grad Max: 0.000472
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000207 | Grad Max: 0.000965
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000613 | Grad Max: 0.000613
[GRADIENT NORM TOTAL] 1.4702

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.455
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040564  0.49594364] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.045
[MASKS] A(Pass/Fail): 612/1436 | B: 533/1515 | C: 270/1778
[LOSS Ex1] A: 0.66213 | B: 0.65420 | C: 0.65337
[LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.065
[LOSS Ex2] A: 0.17243 | B: 0.38750 | C: 0.29575
** [JOINT LOSS] ** : 0.941794
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005463 | Grad Max: 0.142037
  -> Layer: shared_layers.0.bias | Grad Mean: 0.249711 | Grad Max: 1.025673
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.006027
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003902 | Grad Max: 0.003902
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001777 | Grad Max: 0.275897
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032574 | Grad Max: 1.529553
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.007938
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015308 | Grad Max: 0.060142
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000473
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003434 | Grad Max: 0.007402
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000253
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000987 | Grad Max: 0.002747
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001331 | Grad Max: 0.002651
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023474 | Grad Max: 0.023474
[GRADIENT NORM TOTAL] 5.1663

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.383
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53318006 0.4668199 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.536 | Std: 0.044
[MASKS] A(Pass/Fail): 605/1443 | B: 496/1360 | C: 289/1759
[LOSS Ex1] A: 0.66187 | B: 0.65759 | C: 0.65326
[LOGITS Ex2 A] Mean Abs: 1.749 | Max: 6.934
[LOSS Ex2] A: 0.18677 | B: 0.36922 | C: 0.33461
** [JOINT LOSS] ** : 0.954435
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005685 | Grad Max: 0.217202
  -> Layer: shared_layers.0.bias | Grad Mean: 0.080434 | Grad Max: 0.335134
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002026 | Grad Max: 0.006257
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002974 | Grad Max: 0.002974
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000836 | Grad Max: 0.151741
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013534 | Grad Max: 0.855128
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000110 | Grad Max: 0.003789
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004704 | Grad Max: 0.022593
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000332
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001211 | Grad Max: 0.003537
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000142
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000370 | Grad Max: 0.001260
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000607 | Grad Max: 0.001497
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009672 | Grad Max: 0.009672
[GRADIENT NORM TOTAL] 2.2377

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.518
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.65540504 0.34459496] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 676/1372 | B: 519/1529 | C: 280/1768
[LOSS Ex1] A: 0.65766 | B: 0.65682 | C: 0.65187
[LOGITS Ex2 A] Mean Abs: 1.824 | Max: 6.250
[LOSS Ex2] A: 0.17724 | B: 0.40505 | C: 0.30787
** [JOINT LOSS] ** : 0.952171
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003798 | Grad Max: 0.130572
  -> Layer: shared_layers.0.bias | Grad Mean: 0.364875 | Grad Max: 1.662331
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.007397
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005895 | Grad Max: 0.005895
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.233082
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043714 | Grad Max: 1.272204
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000417 | Grad Max: 0.017302
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020948 | Grad Max: 0.123132
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000676
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004394 | Grad Max: 0.009941
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000254
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001247 | Grad Max: 0.002953
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001570 | Grad Max: 0.002875
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029652 | Grad Max: 0.029652
[GRADIENT NORM TOTAL] 7.5343

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.582
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005553  0.49944472] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.045
[MASKS] A(Pass/Fail): 650/1398 | B: 534/1514 | C: 281/1767
[LOSS Ex1] A: 0.66250 | B: 0.65749 | C: 0.65092
[LOGITS Ex2 A] Mean Abs: 1.804 | Max: 5.961
[LOSS Ex2] A: 0.17371 | B: 0.40010 | C: 0.30225
** [JOINT LOSS] ** : 0.948993
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004777 | Grad Max: 0.175150
  -> Layer: shared_layers.0.bias | Grad Mean: 0.413692 | Grad Max: 2.114551
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002046 | Grad Max: 0.005867
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004508 | Grad Max: 0.004508
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002640 | Grad Max: 0.282084
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047429 | Grad Max: 1.577381
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000432 | Grad Max: 0.015966
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021626 | Grad Max: 0.122072
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000679
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004451 | Grad Max: 0.010409
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001259 | Grad Max: 0.002901
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001502 | Grad Max: 0.003280
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028970 | Grad Max: 0.028970
[GRADIENT NORM TOTAL] 8.4697

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.375
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62770534 0.37229466] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 637/1411 | B: 535/1513 | C: 263/1785
[LOSS Ex1] A: 0.65968 | B: 0.65399 | C: 0.65401
[LOGITS Ex2 A] Mean Abs: 1.786 | Max: 6.067
[LOSS Ex2] A: 0.19398 | B: 0.37597 | C: 0.30405
** [JOINT LOSS] ** : 0.947222
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003233 | Grad Max: 0.119968
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125490 | Grad Max: 0.544307
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006798
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000121 | Grad Max: 0.000121
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001010 | Grad Max: 0.143216
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017047 | Grad Max: 0.802015
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.006388
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005730 | Grad Max: 0.044852
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000237
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001077 | Grad Max: 0.003231
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000295 | Grad Max: 0.000958
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000325 | Grad Max: 0.001218
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006754 | Grad Max: 0.006754
[GRADIENT NORM TOTAL] 2.8616

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.451
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5809616  0.41903844] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 524/1092 | B: 502/1354 | C: 268/1780
[LOSS Ex1] A: 0.65837 | B: 0.65740 | C: 0.65142
[LOGITS Ex2 A] Mean Abs: 1.788 | Max: 5.890
[LOSS Ex2] A: 0.16700 | B: 0.38268 | C: 0.29953
** [JOINT LOSS] ** : 0.938797
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006325 | Grad Max: 0.169389
  -> Layer: shared_layers.0.bias | Grad Mean: 0.379571 | Grad Max: 1.499977
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.006777
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001250 | Grad Max: 0.001250
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002442 | Grad Max: 0.300442
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045395 | Grad Max: 1.641799
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000433 | Grad Max: 0.014549
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021752 | Grad Max: 0.108752
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000647
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004796 | Grad Max: 0.010372
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000295
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001378 | Grad Max: 0.003165
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001746 | Grad Max: 0.003253
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032348 | Grad Max: 0.032348
[GRADIENT NORM TOTAL] 7.3708

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.584
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063782 0.4936218] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 653/1395 | B: 531/1517 | C: 292/1756
[LOSS Ex1] A: 0.65863 | B: 0.65663 | C: 0.65051
[LOGITS Ex2 A] Mean Abs: 1.758 | Max: 6.218
[LOSS Ex2] A: 0.17258 | B: 0.41642 | C: 0.30186
** [JOINT LOSS] ** : 0.952209
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005457 | Grad Max: 0.150562
  -> Layer: shared_layers.0.bias | Grad Mean: 0.467417 | Grad Max: 1.979501
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.007050
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003038 | Grad Max: 0.003038
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002897 | Grad Max: 0.362920
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054374 | Grad Max: 2.054542
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000523 | Grad Max: 0.017443
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026541 | Grad Max: 0.122004
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000729
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005777 | Grad Max: 0.011777
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000350
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001666 | Grad Max: 0.003949
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002056 | Grad Max: 0.003977
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039571 | Grad Max: 0.039571
[GRADIENT NORM TOTAL] 9.2950

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.533
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50946397 0.49053603] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 637/1411 | B: 548/1500 | C: 276/1772
[LOSS Ex1] A: 0.65668 | B: 0.65731 | C: 0.65248
[LOGITS Ex2 A] Mean Abs: 1.776 | Max: 6.196
[LOSS Ex2] A: 0.17139 | B: 0.39697 | C: 0.30910
** [JOINT LOSS] ** : 0.947977
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004150 | Grad Max: 0.117199
  -> Layer: shared_layers.0.bias | Grad Mean: 0.223697 | Grad Max: 0.991245
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.006833
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000541 | Grad Max: 0.000541
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001496 | Grad Max: 0.201495
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026308 | Grad Max: 1.124585
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000216 | Grad Max: 0.007979
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010737 | Grad Max: 0.056171
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002223 | Grad Max: 0.005277
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000646 | Grad Max: 0.001609
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000800 | Grad Max: 0.001830
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015819 | Grad Max: 0.015819
[GRADIENT NORM TOTAL] 4.6066

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.550
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017539  0.49824604] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 639/1409 | B: 545/1503 | C: 272/1776
[LOSS Ex1] A: 0.65567 | B: 0.65380 | C: 0.65229
[LOGITS Ex2 A] Mean Abs: 1.795 | Max: 7.149
[LOSS Ex2] A: 0.20459 | B: 0.37618 | C: 0.31133
** [JOINT LOSS] ** : 0.951289
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010196 | Grad Max: 0.377459
  -> Layer: shared_layers.0.bias | Grad Mean: 0.380662 | Grad Max: 1.676027
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.007946
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006979 | Grad Max: 0.006979
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002804 | Grad Max: 0.259710
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050634 | Grad Max: 1.343517
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.013404
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024100 | Grad Max: 0.102511
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000809
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005502 | Grad Max: 0.012403
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001617 | Grad Max: 0.003794
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002184 | Grad Max: 0.004465
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039591 | Grad Max: 0.039591
[GRADIENT NORM TOTAL] 7.4215

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.460
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039963 0.4960037] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.045
[MASKS] A(Pass/Fail): 617/1431 | B: 504/1352 | C: 194/1182
[LOSS Ex1] A: 0.66182 | B: 0.65723 | C: 0.65004
[LOGITS Ex2 A] Mean Abs: 1.788 | Max: 6.353
[LOSS Ex2] A: 0.17985 | B: 0.36824 | C: 0.28656
** [JOINT LOSS] ** : 0.934576
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009592 | Grad Max: 0.302949
  -> Layer: shared_layers.0.bias | Grad Mean: 0.384959 | Grad Max: 1.623849
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.006001
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000258 | Grad Max: 0.000258
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002698 | Grad Max: 0.294098
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048885 | Grad Max: 1.511602
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.013335
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023512 | Grad Max: 0.101377
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000753
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005312 | Grad Max: 0.011197
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000317
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001554 | Grad Max: 0.003447
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001941 | Grad Max: 0.004072
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036445 | Grad Max: 0.036445
[GRADIENT NORM TOTAL] 7.4543

[EPOCH SUMMARY] Train Loss: 0.9454

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9219 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9263 -> New: 0.9219)

############################## EPOCH 81/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.389
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5335998 0.4664002] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 606/1442 | B: 534/1514 | C: 273/1775
[LOSS Ex1] A: 0.66156 | B: 0.65647 | C: 0.65295
[LOGITS Ex2 A] Mean Abs: 1.716 | Max: 6.033
[LOSS Ex2] A: 0.18334 | B: 0.39734 | C: 0.32704
** [JOINT LOSS] ** : 0.959564
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006662 | Grad Max: 0.204318
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153545 | Grad Max: 0.655684
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.006340
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003585 | Grad Max: 0.003585
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001249 | Grad Max: 0.117567
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022055 | Grad Max: 0.628844
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000205 | Grad Max: 0.005546
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009831 | Grad Max: 0.036649
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000422
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002287 | Grad Max: 0.005443
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000666 | Grad Max: 0.001698
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000869 | Grad Max: 0.001855
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015670 | Grad Max: 0.015670
[GRADIENT NORM TOTAL] 3.0837

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.523
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6573193  0.34268063] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.046
[MASKS] A(Pass/Fail): 679/1369 | B: 550/1498 | C: 282/1766
[LOSS Ex1] A: 0.65732 | B: 0.65716 | C: 0.65029
[LOGITS Ex2 A] Mean Abs: 1.720 | Max: 6.254
[LOSS Ex2] A: 0.17187 | B: 0.41470 | C: 0.27924
** [JOINT LOSS] ** : 0.943525
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005954 | Grad Max: 0.168725
  -> Layer: shared_layers.0.bias | Grad Mean: 0.510062 | Grad Max: 2.350912
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.006980
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006036 | Grad Max: 0.006036
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003226 | Grad Max: 0.356363
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061098 | Grad Max: 2.002646
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000607 | Grad Max: 0.018669
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030785 | Grad Max: 0.140951
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000852
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006671 | Grad Max: 0.014129
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000391
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001906 | Grad Max: 0.004507
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002274 | Grad Max: 0.003838
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043489 | Grad Max: 0.043489
[GRADIENT NORM TOTAL] 10.0719

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.587
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006303  0.49936968] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 653/1395 | B: 546/1502 | C: 308/1740
[LOSS Ex1] A: 0.66221 | B: 0.65364 | C: 0.64951
[LOGITS Ex2 A] Mean Abs: 1.709 | Max: 5.831
[LOSS Ex2] A: 0.18140 | B: 0.40672 | C: 0.31290
** [JOINT LOSS] ** : 0.955462
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012038 | Grad Max: 0.333008
  -> Layer: shared_layers.0.bias | Grad Mean: 0.687403 | Grad Max: 3.082189
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006643
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006442 | Grad Max: 0.006442
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004619 | Grad Max: 0.439915
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086036 | Grad Max: 2.304513
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000856 | Grad Max: 0.025002
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042994 | Grad Max: 0.196189
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000111 | Grad Max: 0.001231
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009510 | Grad Max: 0.019413
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000525
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002747 | Grad Max: 0.006288
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003361 | Grad Max: 0.006337
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064008 | Grad Max: 0.064008
[GRADIENT NORM TOTAL] 13.3274

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.380
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62911975 0.37088025] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 637/1411 | B: 504/1352 | C: 248/1800
[LOSS Ex1] A: 0.65937 | B: 0.65708 | C: 0.65271
[LOGITS Ex2 A] Mean Abs: 1.706 | Max: 5.993
[LOSS Ex2] A: 0.19274 | B: 0.38035 | C: 0.30062
** [JOINT LOSS] ** : 0.947623
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009145 | Grad Max: 0.266270
  -> Layer: shared_layers.0.bias | Grad Mean: 0.495783 | Grad Max: 2.127459
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.007262
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004976 | Grad Max: 0.004976
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003243 | Grad Max: 0.283185
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060574 | Grad Max: 1.393398
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000608 | Grad Max: 0.019339
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030545 | Grad Max: 0.140382
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000872
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006732 | Grad Max: 0.013879
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000422
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001926 | Grad Max: 0.004537
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002324 | Grad Max: 0.003990
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043689 | Grad Max: 0.043689
[GRADIENT NORM TOTAL] 9.3145

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.455
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5818856  0.41811445] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.046
[MASKS] A(Pass/Fail): 526/1090 | B: 534/1514 | C: 292/1756
[LOSS Ex1] A: 0.65806 | B: 0.65632 | C: 0.64982
[LOGITS Ex2 A] Mean Abs: 1.765 | Max: 5.954
[LOSS Ex2] A: 0.17247 | B: 0.38988 | C: 0.27920
** [JOINT LOSS] ** : 0.935255
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005890 | Grad Max: 0.199373
  -> Layer: shared_layers.0.bias | Grad Mean: 0.092362 | Grad Max: 0.485583
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.006604
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003541 | Grad Max: 0.003541
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000830 | Grad Max: 0.074697
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011838 | Grad Max: 0.398093
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003643
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002247 | Grad Max: 0.020341
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000237
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000485 | Grad Max: 0.002336
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000149 | Grad Max: 0.000626
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000510 | Grad Max: 0.001616
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004009 | Grad Max: 0.004009
[GRADIENT NORM TOTAL] 2.0417

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.589
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064215  0.49357846] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 654/1394 | B: 550/1498 | C: 270/1778
[LOSS Ex1] A: 0.65834 | B: 0.65702 | C: 0.65164
[LOGITS Ex2 A] Mean Abs: 1.793 | Max: 6.515
[LOSS Ex2] A: 0.17342 | B: 0.40667 | C: 0.31212
** [JOINT LOSS] ** : 0.953069
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005190 | Grad Max: 0.202632
  -> Layer: shared_layers.0.bias | Grad Mean: 0.558156 | Grad Max: 2.733904
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.006709
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001161 | Grad Max: 0.001161
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003437 | Grad Max: 0.356448
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064283 | Grad Max: 2.029707
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000612 | Grad Max: 0.021178
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031325 | Grad Max: 0.160414
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000928
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006791 | Grad Max: 0.014095
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000367
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001972 | Grad Max: 0.004459
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002354 | Grad Max: 0.004395
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046288 | Grad Max: 0.046288
[GRADIENT NORM TOTAL] 11.2901

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.538
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50961715 0.49038285] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 643/1405 | B: 548/1500 | C: 275/1773
[LOSS Ex1] A: 0.65638 | B: 0.65349 | C: 0.65249
[LOGITS Ex2 A] Mean Abs: 1.800 | Max: 5.580
[LOSS Ex2] A: 0.18451 | B: 0.41337 | C: 0.31277
** [JOINT LOSS] ** : 0.957665
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009634 | Grad Max: 0.242697
  -> Layer: shared_layers.0.bias | Grad Mean: 0.728804 | Grad Max: 3.301135
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.007279
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004122 | Grad Max: 0.004122
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004575 | Grad Max: 0.506763
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086158 | Grad Max: 2.765893
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000818 | Grad Max: 0.025991
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.041552 | Grad Max: 0.196951
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009075 | Grad Max: 0.019124
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000509
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002619 | Grad Max: 0.006123
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.003188 | Grad Max: 0.006005
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060739 | Grad Max: 0.060739
[GRADIENT NORM TOTAL] 14.3447

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.554
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50176126 0.4982388 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 640/1408 | B: 505/1351 | C: 260/1788
[LOSS Ex1] A: 0.65537 | B: 0.65695 | C: 0.65308
[LOGITS Ex2 A] Mean Abs: 1.788 | Max: 5.817
[LOSS Ex2] A: 0.20228 | B: 0.37004 | C: 0.29642
** [JOINT LOSS] ** : 0.944716
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009152 | Grad Max: 0.239818
  -> Layer: shared_layers.0.bias | Grad Mean: 0.554160 | Grad Max: 2.352176
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.007452
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004804 | Grad Max: 0.004804
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003715 | Grad Max: 0.355053
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069262 | Grad Max: 1.901276
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000659 | Grad Max: 0.020432
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033139 | Grad Max: 0.152170
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000926
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007333 | Grad Max: 0.014796
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000407
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002150 | Grad Max: 0.004786
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002676 | Grad Max: 0.005292
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051279 | Grad Max: 0.051279
[GRADIENT NORM TOTAL] 10.9719

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.463
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50400186 0.49599817] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.537 | Std: 0.045
[MASKS] A(Pass/Fail): 622/1426 | B: 535/1513 | C: 280/1768
[LOSS Ex1] A: 0.66158 | B: 0.65619 | C: 0.65250
[LOGITS Ex2 A] Mean Abs: 1.725 | Max: 5.423
[LOSS Ex2] A: 0.16935 | B: 0.39359 | C: 0.30681
** [JOINT LOSS] ** : 0.946677
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004314 | Grad Max: 0.150054
  -> Layer: shared_layers.0.bias | Grad Mean: 0.060726 | Grad Max: 0.388063
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.006066
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004864 | Grad Max: 0.004864
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000566 | Grad Max: 0.077248
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008552 | Grad Max: 0.428223
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.003322
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001659 | Grad Max: 0.018447
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000163
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000300 | Grad Max: 0.001628
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000097 | Grad Max: 0.000591
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001079
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002054 | Grad Max: 0.002054
[GRADIENT NORM TOTAL] 1.5528

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.392
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53387016 0.46612987] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.044
[MASKS] A(Pass/Fail): 610/1438 | B: 551/1497 | C: 280/1768
[LOSS Ex1] A: 0.66132 | B: 0.65689 | C: 0.65225
[LOGITS Ex2 A] Mean Abs: 1.656 | Max: 5.785
[LOSS Ex2] A: 0.18715 | B: 0.40788 | C: 0.29706
** [JOINT LOSS] ** : 0.954182
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006167 | Grad Max: 0.162745
  -> Layer: shared_layers.0.bias | Grad Mean: 0.485147 | Grad Max: 2.235384
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002032 | Grad Max: 0.006252
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000606 | Grad Max: 0.000606
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003055 | Grad Max: 0.257413
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057893 | Grad Max: 1.454363
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000575 | Grad Max: 0.016645
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029197 | Grad Max: 0.136177
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000839
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006349 | Grad Max: 0.013603
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000396
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001822 | Grad Max: 0.004483
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002183 | Grad Max: 0.003967
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042400 | Grad Max: 0.042400
[GRADIENT NORM TOTAL] 9.4466

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.526
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.65864146 0.34135848] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 680/1368 | B: 549/1499 | C: 289/1759
[LOSS Ex1] A: 0.65706 | B: 0.65336 | C: 0.65015
[LOGITS Ex2 A] Mean Abs: 1.695 | Max: 6.947
[LOSS Ex2] A: 0.17090 | B: 0.39574 | C: 0.30140
** [JOINT LOSS] ** : 0.942869
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006457 | Grad Max: 0.160879
  -> Layer: shared_layers.0.bias | Grad Mean: 0.496281 | Grad Max: 2.272225
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002350 | Grad Max: 0.007292
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006852 | Grad Max: 0.006852
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003161 | Grad Max: 0.339691
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059380 | Grad Max: 1.898198
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000570 | Grad Max: 0.017577
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028923 | Grad Max: 0.141136
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000871
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006333 | Grad Max: 0.013727
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000415
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001814 | Grad Max: 0.004707
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002093 | Grad Max: 0.003605
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040573 | Grad Max: 0.040573
[GRADIENT NORM TOTAL] 9.7704

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.591
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50052077 0.49947926] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 656/1392 | B: 506/1350 | C: 272/1776
[LOSS Ex1] A: 0.66199 | B: 0.65682 | C: 0.65269
[LOGITS Ex2 A] Mean Abs: 1.720 | Max: 5.857
[LOSS Ex2] A: 0.16522 | B: 0.37229 | C: 0.30242
** [JOINT LOSS] ** : 0.937140
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005655 | Grad Max: 0.153572
  -> Layer: shared_layers.0.bias | Grad Mean: 0.314178 | Grad Max: 1.422749
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.006150
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003119 | Grad Max: 0.003119
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.225124
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038147 | Grad Max: 1.268969
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.011168
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018709 | Grad Max: 0.085546
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000640
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004124 | Grad Max: 0.009411
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000298
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001176 | Grad Max: 0.003369
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001396 | Grad Max: 0.002995
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026654 | Grad Max: 0.026654
[GRADIENT NORM TOTAL] 6.0698

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.382
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6299276  0.37007245] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 637/1411 | B: 535/1513 | C: 265/1783
[LOSS Ex1] A: 0.65913 | B: 0.65605 | C: 0.65250
[LOGITS Ex2 A] Mean Abs: 1.756 | Max: 5.968
[LOSS Ex2] A: 0.17657 | B: 0.39061 | C: 0.29676
** [JOINT LOSS] ** : 0.943877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002803 | Grad Max: 0.085848
  -> Layer: shared_layers.0.bias | Grad Mean: 0.253659 | Grad Max: 1.076225
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.006763
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002749 | Grad Max: 0.002749
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001638 | Grad Max: 0.178319
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030474 | Grad Max: 1.004703
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.010435
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015485 | Grad Max: 0.077942
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000509
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003326 | Grad Max: 0.007456
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000200
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000957 | Grad Max: 0.002166
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001118 | Grad Max: 0.002863
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022188 | Grad Max: 0.022188
[GRADIENT NORM TOTAL] 5.0779

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.458
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58232754 0.4176725 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 526/1090 | B: 553/1495 | C: 180/1196
[LOSS Ex1] A: 0.65781 | B: 0.65675 | C: 0.65328
[LOGITS Ex2 A] Mean Abs: 1.810 | Max: 5.680
[LOSS Ex2] A: 0.16449 | B: 0.39501 | C: 0.30016
** [JOINT LOSS] ** : 0.942499
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004469 | Grad Max: 0.136601
  -> Layer: shared_layers.0.bias | Grad Mean: 0.407912 | Grad Max: 1.738238
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.006476
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001340 | Grad Max: 0.001340
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002570 | Grad Max: 0.282283
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048243 | Grad Max: 1.595436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000489 | Grad Max: 0.016911
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024875 | Grad Max: 0.124885
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000803
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005385 | Grad Max: 0.012196
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000310
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001558 | Grad Max: 0.003609
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001870 | Grad Max: 0.004041
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036221 | Grad Max: 0.036221
[GRADIENT NORM TOTAL] 8.2101

[EPOCH SUMMARY] Train Loss: 0.9474

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9240 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 82/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.592
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5065801  0.49341986] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 654/1394 | B: 549/1499 | C: 270/1778
[LOSS Ex1] A: 0.65811 | B: 0.65322 | C: 0.65337
[LOGITS Ex2 A] Mean Abs: 1.778 | Max: 5.847
[LOSS Ex2] A: 0.16819 | B: 0.37460 | C: 0.29584
** [JOINT LOSS] ** : 0.934444
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002571 | Grad Max: 0.120144
  -> Layer: shared_layers.0.bias | Grad Mean: 0.285165 | Grad Max: 1.381527
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.007420
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009273 | Grad Max: 0.009273
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001812 | Grad Max: 0.179918
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033351 | Grad Max: 1.017690
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000353 | Grad Max: 0.012483
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018033 | Grad Max: 0.096063
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000513
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003891 | Grad Max: 0.008151
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001148 | Grad Max: 0.002598
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001390 | Grad Max: 0.003307
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027690 | Grad Max: 0.027690
[GRADIENT NORM TOTAL] 5.6790

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.541
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5095515  0.49044845] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 645/1403 | B: 506/1350 | C: 291/1757
[LOSS Ex1] A: 0.65614 | B: 0.65669 | C: 0.65118
[LOGITS Ex2 A] Mean Abs: 1.727 | Max: 5.924
[LOSS Ex2] A: 0.17421 | B: 0.37842 | C: 0.30911
** [JOINT LOSS] ** : 0.941919
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004113 | Grad Max: 0.101901
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268569 | Grad Max: 1.062521
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002245 | Grad Max: 0.007595
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007603 | Grad Max: 0.007603
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001639 | Grad Max: 0.306845
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030542 | Grad Max: 1.729981
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000263 | Grad Max: 0.008323
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013412 | Grad Max: 0.061444
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000516
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002960 | Grad Max: 0.006557
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000212
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000855 | Grad Max: 0.002264
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001062 | Grad Max: 0.002376
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020204 | Grad Max: 0.020204
[GRADIENT NORM TOTAL] 5.6833

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.558
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019347  0.49806532] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 640/1408 | B: 535/1513 | C: 257/1791
[LOSS Ex1] A: 0.65513 | B: 0.65593 | C: 0.65289
[LOGITS Ex2 A] Mean Abs: 1.702 | Max: 6.407
[LOSS Ex2] A: 0.19064 | B: 0.41444 | C: 0.29686
** [JOINT LOSS] ** : 0.955299
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007640 | Grad Max: 0.225978
  -> Layer: shared_layers.0.bias | Grad Mean: 0.537965 | Grad Max: 2.274895
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.007554
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004831 | Grad Max: 0.004831
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003299 | Grad Max: 0.462809
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061941 | Grad Max: 2.497517
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000575 | Grad Max: 0.019090
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029345 | Grad Max: 0.141659
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000861
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006452 | Grad Max: 0.012990
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000404
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001870 | Grad Max: 0.004502
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002232 | Grad Max: 0.004017
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043191 | Grad Max: 0.043191
[GRADIENT NORM TOTAL] 10.6399

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.466
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041577  0.49584228] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 623/1425 | B: 554/1494 | C: 280/1768
[LOSS Ex1] A: 0.66137 | B: 0.65664 | C: 0.65091
[LOGITS Ex2 A] Mean Abs: 1.686 | Max: 5.590
[LOSS Ex2] A: 0.16817 | B: 0.39703 | C: 0.28469
** [JOINT LOSS] ** : 0.939600
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007198 | Grad Max: 0.184984
  -> Layer: shared_layers.0.bias | Grad Mean: 0.439171 | Grad Max: 1.656231
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002002 | Grad Max: 0.006484
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007226 | Grad Max: 0.007226
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002783 | Grad Max: 0.392639
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052201 | Grad Max: 2.130070
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000510 | Grad Max: 0.015474
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025881 | Grad Max: 0.113705
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000786
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005693 | Grad Max: 0.013260
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000352
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001638 | Grad Max: 0.004158
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001968 | Grad Max: 0.003854
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037296 | Grad Max: 0.037296
[GRADIENT NORM TOTAL] 8.5420

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.395
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5339173  0.46608266] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.045
[MASKS] A(Pass/Fail): 611/1437 | B: 549/1499 | C: 288/1760
[LOSS Ex1] A: 0.66113 | B: 0.65310 | C: 0.65098
[LOGITS Ex2 A] Mean Abs: 1.712 | Max: 5.631
[LOSS Ex2] A: 0.17902 | B: 0.37392 | C: 0.30741
** [JOINT LOSS] ** : 0.941851
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002365 | Grad Max: 0.052858
  -> Layer: shared_layers.0.bias | Grad Mean: 0.148174 | Grad Max: 0.678899
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006992
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009358 | Grad Max: 0.009358
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001035 | Grad Max: 0.127595
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019085 | Grad Max: 0.712854
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.006747
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008843 | Grad Max: 0.040413
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000295
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001918 | Grad Max: 0.004924
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000142
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000545 | Grad Max: 0.001358
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000591 | Grad Max: 0.001754
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011947 | Grad Max: 0.011947
[GRADIENT NORM TOTAL] 3.1570

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.529
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6596345  0.34036553] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 681/1367 | B: 506/1350 | C: 288/1760
[LOSS Ex1] A: 0.65686 | B: 0.65658 | C: 0.64967
[LOGITS Ex2 A] Mean Abs: 1.768 | Max: 5.645
[LOSS Ex2] A: 0.16717 | B: 0.36816 | C: 0.30261
** [JOINT LOSS] ** : 0.933681
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004780 | Grad Max: 0.119598
  -> Layer: shared_layers.0.bias | Grad Mean: 0.276967 | Grad Max: 1.196343
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006994
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004872 | Grad Max: 0.004872
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001902 | Grad Max: 0.193620
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035188 | Grad Max: 1.082307
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000323 | Grad Max: 0.011038
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016437 | Grad Max: 0.081377
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000547
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003665 | Grad Max: 0.007795
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000207
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001071 | Grad Max: 0.002599
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001337 | Grad Max: 0.003164
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025583 | Grad Max: 0.025583
[GRADIENT NORM TOTAL] 5.6112

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.594
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004416 0.4995584] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 656/1392 | B: 535/1513 | C: 295/1753
[LOSS Ex1] A: 0.66181 | B: 0.65581 | C: 0.65053
[LOGITS Ex2 A] Mean Abs: 1.746 | Max: 5.401
[LOSS Ex2] A: 0.15723 | B: 0.39486 | C: 0.28746
** [JOINT LOSS] ** : 0.935899
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001664 | Grad Max: 0.032129
  -> Layer: shared_layers.0.bias | Grad Mean: 0.025284 | Grad Max: 0.117491
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.006042
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001326 | Grad Max: 0.001326
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000359 | Grad Max: 0.060886
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005808 | Grad Max: 0.323983
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002608
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001487 | Grad Max: 0.011405
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000142
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000282 | Grad Max: 0.001852
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000074 | Grad Max: 0.000395
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000426 | Grad Max: 0.000968
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000678 | Grad Max: 0.000678
[GRADIENT NORM TOTAL] 0.9092

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.385
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63071036 0.36928967] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 638/1410 | B: 556/1492 | C: 279/1769
[LOSS Ex1] A: 0.65892 | B: 0.65651 | C: 0.65055
[LOGITS Ex2 A] Mean Abs: 1.733 | Max: 5.937
[LOSS Ex2] A: 0.17962 | B: 0.38824 | C: 0.29462
** [JOINT LOSS] ** : 0.942819
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003933 | Grad Max: 0.092307
  -> Layer: shared_layers.0.bias | Grad Mean: 0.280926 | Grad Max: 1.261168
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.006211
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000546 | Grad Max: 0.000546
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001786 | Grad Max: 0.125419
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033547 | Grad Max: 0.706983
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.010552
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017070 | Grad Max: 0.081388
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000489
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003728 | Grad Max: 0.007781
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000228
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001071 | Grad Max: 0.002605
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.002569
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024460 | Grad Max: 0.024460
[GRADIENT NORM TOTAL] 5.2986

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.461
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58279115 0.41720888] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 526/1090 | B: 551/1497 | C: 269/1779
[LOSS Ex1] A: 0.65759 | B: 0.65296 | C: 0.65332
[LOGITS Ex2 A] Mean Abs: 1.767 | Max: 5.667
[LOSS Ex2] A: 0.17001 | B: 0.37627 | C: 0.29533
** [JOINT LOSS] ** : 0.935160
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002647 | Grad Max: 0.085992
  -> Layer: shared_layers.0.bias | Grad Mean: 0.231150 | Grad Max: 1.094567
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.006859
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005953 | Grad Max: 0.005953
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001557 | Grad Max: 0.211276
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029016 | Grad Max: 1.189796
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.010034
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013965 | Grad Max: 0.068089
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000435
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003024 | Grad Max: 0.006864
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000878 | Grad Max: 0.002094
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001108 | Grad Max: 0.002507
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020441 | Grad Max: 0.020441
[GRADIENT NORM TOTAL] 4.8683

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.595
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066391  0.49336094] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 656/1392 | B: 506/1350 | C: 249/1799
[LOSS Ex1] A: 0.65789 | B: 0.65643 | C: 0.65249
[LOGITS Ex2 A] Mean Abs: 1.802 | Max: 5.996
[LOSS Ex2] A: 0.17782 | B: 0.36830 | C: 0.31688
** [JOINT LOSS] ** : 0.943269
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004236 | Grad Max: 0.096834
  -> Layer: shared_layers.0.bias | Grad Mean: 0.247909 | Grad Max: 1.074477
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.006610
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002928 | Grad Max: 0.002928
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001594 | Grad Max: 0.208197
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029353 | Grad Max: 1.144679
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000269 | Grad Max: 0.008641
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013693 | Grad Max: 0.068626
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000442
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003028 | Grad Max: 0.006962
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000217
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000876 | Grad Max: 0.002308
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001017 | Grad Max: 0.002309
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019686 | Grad Max: 0.019686
[GRADIENT NORM TOTAL] 5.0265

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.545
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5096312  0.49036878] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 649/1399 | B: 535/1513 | C: 287/1761
[LOSS Ex1] A: 0.65589 | B: 0.65566 | C: 0.64894
[LOGITS Ex2 A] Mean Abs: 1.797 | Max: 5.219
[LOSS Ex2] A: 0.17316 | B: 0.39851 | C: 0.30095
** [JOINT LOSS] ** : 0.944370
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006472 | Grad Max: 0.192561
  -> Layer: shared_layers.0.bias | Grad Mean: 0.352406 | Grad Max: 1.554785
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.007657
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006991 | Grad Max: 0.006991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.248939
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041080 | Grad Max: 1.235720
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.012793
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020303 | Grad Max: 0.090065
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000594
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004490 | Grad Max: 0.009589
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000261
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001289 | Grad Max: 0.003065
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001512 | Grad Max: 0.003144
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029106 | Grad Max: 0.029106
[GRADIENT NORM TOTAL] 6.7047

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.561
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.501988   0.49801198] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 641/1407 | B: 556/1492 | C: 256/1792
[LOSS Ex1] A: 0.65488 | B: 0.65637 | C: 0.65228
[LOGITS Ex2 A] Mean Abs: 1.777 | Max: 7.485
[LOSS Ex2] A: 0.18218 | B: 0.38715 | C: 0.29815
** [JOINT LOSS] ** : 0.943668
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002697 | Grad Max: 0.089653
  -> Layer: shared_layers.0.bias | Grad Mean: 0.093267 | Grad Max: 0.395723
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.007326
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002241 | Grad Max: 0.002241
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000723 | Grad Max: 0.179864
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011973 | Grad Max: 1.005135
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.003873
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003273 | Grad Max: 0.019951
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000225
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000785 | Grad Max: 0.002920
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000232 | Grad Max: 0.000758
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000341 | Grad Max: 0.001295
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005656 | Grad Max: 0.005656
[GRADIENT NORM TOTAL] 2.3150

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.469
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50414985 0.49585018] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 624/1424 | B: 551/1497 | C: 277/1771
[LOSS Ex1] A: 0.66113 | B: 0.65280 | C: 0.65175
[LOGITS Ex2 A] Mean Abs: 1.715 | Max: 6.010
[LOSS Ex2] A: 0.17212 | B: 0.38522 | C: 0.28686
** [JOINT LOSS] ** : 0.936629
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006032 | Grad Max: 0.150363
  -> Layer: shared_layers.0.bias | Grad Mean: 0.317297 | Grad Max: 1.281781
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.006162
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002144 | Grad Max: 0.002144
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.324702
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042218 | Grad Max: 1.844111
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000406 | Grad Max: 0.012556
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020696 | Grad Max: 0.089555
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000641
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004581 | Grad Max: 0.010216
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000304
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001306 | Grad Max: 0.003227
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001618 | Grad Max: 0.003191
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029267 | Grad Max: 0.029267
[GRADIENT NORM TOTAL] 6.6134

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.399
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5341344 0.4658656] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.537 | Std: 0.045
[MASKS] A(Pass/Fail): 612/1436 | B: 507/1349 | C: 187/1189
[LOSS Ex1] A: 0.66088 | B: 0.65628 | C: 0.64872
[LOGITS Ex2 A] Mean Abs: 1.689 | Max: 6.012
[LOSS Ex2] A: 0.18180 | B: 0.37767 | C: 0.30487
** [JOINT LOSS] ** : 0.943408
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007807 | Grad Max: 0.216133
  -> Layer: shared_layers.0.bias | Grad Mean: 0.369193 | Grad Max: 1.449482
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.006344
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001806 | Grad Max: 0.001806
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002512 | Grad Max: 0.351922
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046517 | Grad Max: 1.991943
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000444 | Grad Max: 0.012351
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022345 | Grad Max: 0.094476
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000663
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004991 | Grad Max: 0.010198
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000332
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001443 | Grad Max: 0.003525
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001803 | Grad Max: 0.003239
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033635 | Grad Max: 0.033635
[GRADIENT NORM TOTAL] 7.2816

[EPOCH SUMMARY] Train Loss: 0.9409

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9196 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9219 -> New: 0.9196)

############################## EPOCH 83/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.533
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.66108716 0.33891287] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.047
[MASKS] A(Pass/Fail): 681/1367 | B: 536/1512 | C: 270/1778
[LOSS Ex1] A: 0.65658 | B: 0.65550 | C: 0.65271
[LOGITS Ex2 A] Mean Abs: 1.777 | Max: 6.039
[LOSS Ex2] A: 0.16655 | B: 0.38797 | C: 0.29413
** [JOINT LOSS] ** : 0.937814
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002622 | Grad Max: 0.081762
  -> Layer: shared_layers.0.bias | Grad Mean: 0.053166 | Grad Max: 0.292935
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006792
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004396 | Grad Max: 0.004396
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000500 | Grad Max: 0.085670
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007964 | Grad Max: 0.483320
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.002133
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001921 | Grad Max: 0.014551
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000154
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000419 | Grad Max: 0.001790
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000068
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000124 | Grad Max: 0.000544
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000372 | Grad Max: 0.001223
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002892 | Grad Max: 0.002892
[GRADIENT NORM TOTAL] 1.3592

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.598
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50044656 0.4995535 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 657/1391 | B: 558/1490 | C: 278/1770
[LOSS Ex1] A: 0.66155 | B: 0.65620 | C: 0.64927
[LOGITS Ex2 A] Mean Abs: 1.810 | Max: 5.947
[LOSS Ex2] A: 0.16631 | B: 0.39600 | C: 0.27261
** [JOINT LOSS] ** : 0.933976
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004741 | Grad Max: 0.161372
  -> Layer: shared_layers.0.bias | Grad Mean: 0.435705 | Grad Max: 2.076736
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.005897
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000312 | Grad Max: 0.000312
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002753 | Grad Max: 0.294160
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051950 | Grad Max: 1.685707
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.016290
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024778 | Grad Max: 0.126564
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000671
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005387 | Grad Max: 0.011297
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000307
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001556 | Grad Max: 0.003509
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001857 | Grad Max: 0.004021
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036150 | Grad Max: 0.036150
[GRADIENT NORM TOTAL] 8.8482

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.389
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6318114  0.36818856] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 641/1407 | B: 552/1496 | C: 297/1751
[LOSS Ex1] A: 0.65864 | B: 0.65262 | C: 0.64838
[LOGITS Ex2 A] Mean Abs: 1.812 | Max: 6.115
[LOSS Ex2] A: 0.18800 | B: 0.38922 | C: 0.31829
** [JOINT LOSS] ** : 0.951715
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007302 | Grad Max: 0.175053
  -> Layer: shared_layers.0.bias | Grad Mean: 0.505502 | Grad Max: 2.213763
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002240 | Grad Max: 0.006913
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003582 | Grad Max: 0.003582
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003352 | Grad Max: 0.335153
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062429 | Grad Max: 1.872115
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000581 | Grad Max: 0.019302
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029777 | Grad Max: 0.146140
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000802
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006553 | Grad Max: 0.013387
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000386
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001905 | Grad Max: 0.004346
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002391 | Grad Max: 0.004457
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044892 | Grad Max: 0.044892
[GRADIENT NORM TOTAL] 10.0410

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.465
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5834521 0.4165479] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 527/1089 | B: 508/1348 | C: 271/1777
[LOSS Ex1] A: 0.65729 | B: 0.65610 | C: 0.65167
[LOGITS Ex2 A] Mean Abs: 1.827 | Max: 5.801
[LOSS Ex2] A: 0.16483 | B: 0.37239 | C: 0.31568
** [JOINT LOSS] ** : 0.939321
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004422 | Grad Max: 0.135030
  -> Layer: shared_layers.0.bias | Grad Mean: 0.184235 | Grad Max: 0.736672
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.007038
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003751 | Grad Max: 0.003751
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001372 | Grad Max: 0.154648
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025140 | Grad Max: 0.863260
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.006666
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011095 | Grad Max: 0.050437
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000379
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002471 | Grad Max: 0.005666
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000720 | Grad Max: 0.001731
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000930 | Grad Max: 0.002316
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016741 | Grad Max: 0.016741
[GRADIENT NORM TOTAL] 3.8056

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.600
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066451  0.49335495] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 656/1392 | B: 537/1511 | C: 263/1785
[LOSS Ex1] A: 0.65761 | B: 0.65532 | C: 0.65100
[LOGITS Ex2 A] Mean Abs: 1.780 | Max: 7.449
[LOSS Ex2] A: 0.16512 | B: 0.40602 | C: 0.29181
** [JOINT LOSS] ** : 0.942293
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005520 | Grad Max: 0.158887
  -> Layer: shared_layers.0.bias | Grad Mean: 0.472836 | Grad Max: 2.148050
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.006694
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001117 | Grad Max: 0.001117
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002781 | Grad Max: 0.366372
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052226 | Grad Max: 2.063204
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000516 | Grad Max: 0.017848
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026552 | Grad Max: 0.140814
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000780
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005722 | Grad Max: 0.012057
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000374
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001615 | Grad Max: 0.004076
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001915 | Grad Max: 0.003487
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036620 | Grad Max: 0.036620
[GRADIENT NORM TOTAL] 9.0908

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.550
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5096977  0.49030238] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 654/1394 | B: 560/1488 | C: 280/1768
[LOSS Ex1] A: 0.65558 | B: 0.65603 | C: 0.65204
[LOGITS Ex2 A] Mean Abs: 1.754 | Max: 6.032
[LOSS Ex2] A: 0.16430 | B: 0.42602 | C: 0.29373
** [JOINT LOSS] ** : 0.949233
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006684 | Grad Max: 0.202576
  -> Layer: shared_layers.0.bias | Grad Mean: 0.632596 | Grad Max: 2.748610
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.007429
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006166 | Grad Max: 0.006166
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003863 | Grad Max: 0.404063
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073093 | Grad Max: 2.279216
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000721 | Grad Max: 0.023138
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037285 | Grad Max: 0.172367
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001040
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008078 | Grad Max: 0.017286
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000471
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002309 | Grad Max: 0.005287
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002725 | Grad Max: 0.005312
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052785 | Grad Max: 0.052785
[GRADIENT NORM TOTAL] 12.2945

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.567
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020137  0.49798632] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 644/1404 | B: 553/1495 | C: 276/1772
[LOSS Ex1] A: 0.65458 | B: 0.65245 | C: 0.64994
[LOGITS Ex2 A] Mean Abs: 1.747 | Max: 6.690
[LOSS Ex2] A: 0.18520 | B: 0.39927 | C: 0.30207
** [JOINT LOSS] ** : 0.947838
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005345 | Grad Max: 0.164002
  -> Layer: shared_layers.0.bias | Grad Mean: 0.467742 | Grad Max: 2.181714
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002343 | Grad Max: 0.006927
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001186 | Grad Max: 0.001186
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002877 | Grad Max: 0.374071
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053277 | Grad Max: 2.100256
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000513 | Grad Max: 0.017129
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026373 | Grad Max: 0.138311
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000813
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005628 | Grad Max: 0.012299
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000334
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001605 | Grad Max: 0.003697
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001942 | Grad Max: 0.003318
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037344 | Grad Max: 0.037344
[GRADIENT NORM TOTAL] 9.2168

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.474
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50411963 0.4958804 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 624/1424 | B: 508/1348 | C: 298/1750
[LOSS Ex1] A: 0.66088 | B: 0.65595 | C: 0.64852
[LOGITS Ex2 A] Mean Abs: 1.769 | Max: 5.597
[LOSS Ex2] A: 0.16285 | B: 0.37288 | C: 0.29973
** [JOINT LOSS] ** : 0.933600
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001699 | Grad Max: 0.053940
  -> Layer: shared_layers.0.bias | Grad Mean: 0.059197 | Grad Max: 0.272332
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.006060
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004412 | Grad Max: 0.004412
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000534 | Grad Max: 0.086376
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009237 | Grad Max: 0.485125
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002439
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001394 | Grad Max: 0.012988
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000144
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000235 | Grad Max: 0.001521
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000074
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000070 | Grad Max: 0.000521
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000376 | Grad Max: 0.001041
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000413 | Grad Max: 0.000413
[GRADIENT NORM TOTAL] 1.6288

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.403
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5343707  0.46562928] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.045
[MASKS] A(Pass/Fail): 614/1434 | B: 538/1510 | C: 282/1766
[LOSS Ex1] A: 0.66063 | B: 0.65516 | C: 0.65107
[LOGITS Ex2 A] Mean Abs: 1.779 | Max: 5.379
[LOSS Ex2] A: 0.18569 | B: 0.40410 | C: 0.32434
** [JOINT LOSS] ** : 0.960326
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007567 | Grad Max: 0.172572
  -> Layer: shared_layers.0.bias | Grad Mean: 0.433503 | Grad Max: 1.714309
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.006349
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003115 | Grad Max: 0.003115
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002795 | Grad Max: 0.278566
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051780 | Grad Max: 1.471511
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000512 | Grad Max: 0.016698
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026047 | Grad Max: 0.128368
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000797
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005724 | Grad Max: 0.012545
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000366
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001639 | Grad Max: 0.004058
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002004 | Grad Max: 0.003611
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037072 | Grad Max: 0.037072
[GRADIENT NORM TOTAL] 8.1119

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.537
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.66263825 0.33736175] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 682/1366 | B: 560/1488 | C: 267/1781
[LOSS Ex1] A: 0.65630 | B: 0.65587 | C: 0.65207
[LOGITS Ex2 A] Mean Abs: 1.834 | Max: 5.959
[LOSS Ex2] A: 0.16961 | B: 0.40310 | C: 0.29367
** [JOINT LOSS] ** : 0.943537
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006316 | Grad Max: 0.162133
  -> Layer: shared_layers.0.bias | Grad Mean: 0.456611 | Grad Max: 1.830347
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.007174
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008691 | Grad Max: 0.008691
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002874 | Grad Max: 0.295177
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053149 | Grad Max: 1.647207
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000542 | Grad Max: 0.017463
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027833 | Grad Max: 0.142669
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000718
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006133 | Grad Max: 0.012556
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000357
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001776 | Grad Max: 0.003922
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002146 | Grad Max: 0.004234
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041032 | Grad Max: 0.041032
[GRADIENT NORM TOTAL] 8.7434

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.603
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50048244 0.49951756] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 658/1390 | B: 554/1494 | C: 297/1751
[LOSS Ex1] A: 0.66130 | B: 0.65229 | C: 0.64905
[LOGITS Ex2 A] Mean Abs: 1.806 | Max: 5.686
[LOSS Ex2] A: 0.16455 | B: 0.37203 | C: 0.27610
** [JOINT LOSS] ** : 0.925104
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003013 | Grad Max: 0.092531
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140499 | Grad Max: 0.520390
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.006805
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008023 | Grad Max: 0.008023
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001061 | Grad Max: 0.228574
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018470 | Grad Max: 1.275674
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000134 | Grad Max: 0.005831
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006664 | Grad Max: 0.045544
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000287
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001339 | Grad Max: 0.003784
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000381 | Grad Max: 0.001052
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000513 | Grad Max: 0.001713
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008780 | Grad Max: 0.008780
[GRADIENT NORM TOTAL] 3.3142

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.393
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6329708  0.36702922] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 642/1406 | B: 508/1348 | C: 279/1769
[LOSS Ex1] A: 0.65837 | B: 0.65579 | C: 0.64994
[LOGITS Ex2 A] Mean Abs: 1.752 | Max: 6.140
[LOSS Ex2] A: 0.18807 | B: 0.38013 | C: 0.30552
** [JOINT LOSS] ** : 0.945941
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008201 | Grad Max: 0.240744
  -> Layer: shared_layers.0.bias | Grad Mean: 0.462725 | Grad Max: 2.047340
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.007180
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004799 | Grad Max: 0.004799
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003026 | Grad Max: 0.382904
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056075 | Grad Max: 2.142964
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.015534
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027123 | Grad Max: 0.117988
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000739
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006033 | Grad Max: 0.012423
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000353
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001735 | Grad Max: 0.004007
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002139 | Grad Max: 0.003591
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039762 | Grad Max: 0.039762
[GRADIENT NORM TOTAL] 9.1240

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.469
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58422786 0.4157721 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 530/1086 | B: 538/1510 | C: 247/1801
[LOSS Ex1] A: 0.65702 | B: 0.65500 | C: 0.65391
[LOGITS Ex2 A] Mean Abs: 1.795 | Max: 6.171
[LOSS Ex2] A: 0.17504 | B: 0.42150 | C: 0.31347
** [JOINT LOSS] ** : 0.958646
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010125 | Grad Max: 0.284202
  -> Layer: shared_layers.0.bias | Grad Mean: 0.593720 | Grad Max: 2.488721
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.006767
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002535 | Grad Max: 0.002535
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003883 | Grad Max: 0.505117
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.071966 | Grad Max: 2.800072
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000685 | Grad Max: 0.019504
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035050 | Grad Max: 0.164058
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000996
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007775 | Grad Max: 0.016405
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000453
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002250 | Grad Max: 0.004982
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002864 | Grad Max: 0.005658
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052888 | Grad Max: 0.052888
[GRADIENT NORM TOTAL] 11.5784

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.604
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50669533 0.4933046 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 661/1387 | B: 560/1488 | C: 180/1196
[LOSS Ex1] A: 0.65735 | B: 0.65572 | C: 0.65063
[LOGITS Ex2 A] Mean Abs: 1.781 | Max: 5.874
[LOSS Ex2] A: 0.16606 | B: 0.39763 | C: 0.29773
** [JOINT LOSS] ** : 0.941706
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006566 | Grad Max: 0.174344
  -> Layer: shared_layers.0.bias | Grad Mean: 0.364602 | Grad Max: 1.489672
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.006289
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000901 | Grad Max: 0.000901
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002485 | Grad Max: 0.277678
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046136 | Grad Max: 1.515424
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.016310
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022577 | Grad Max: 0.122459
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000657
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004988 | Grad Max: 0.010293
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000281
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001447 | Grad Max: 0.003284
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001883 | Grad Max: 0.003451
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034556 | Grad Max: 0.034556
[GRADIENT NORM TOTAL] 7.1693

[EPOCH SUMMARY] Train Loss: 0.9436

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9203 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 84/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.555
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5097768  0.49022323] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 657/1391 | B: 555/1493 | C: 301/1747
[LOSS Ex1] A: 0.65531 | B: 0.65214 | C: 0.64687
[LOGITS Ex2 A] Mean Abs: 1.818 | Max: 6.568
[LOSS Ex2] A: 0.17049 | B: 0.36551 | C: 0.30451
** [JOINT LOSS] ** : 0.931606
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002891 | Grad Max: 0.080780
  -> Layer: shared_layers.0.bias | Grad Mean: 0.227049 | Grad Max: 1.069606
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.007033
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000328 | Grad Max: 0.000328
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001428 | Grad Max: 0.169587
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026548 | Grad Max: 0.952679
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.010801
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012478 | Grad Max: 0.073207
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000418
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002680 | Grad Max: 0.006001
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000760 | Grad Max: 0.002076
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000832 | Grad Max: 0.002161
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016654 | Grad Max: 0.016654
[GRADIENT NORM TOTAL] 4.5592

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.572
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020575 0.4979425] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 648/1400 | B: 509/1347 | C: 280/1768
[LOSS Ex1] A: 0.65431 | B: 0.65565 | C: 0.65088
[LOGITS Ex2 A] Mean Abs: 1.825 | Max: 7.315
[LOSS Ex2] A: 0.19226 | B: 0.36604 | C: 0.30118
** [JOINT LOSS] ** : 0.940105
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005924 | Grad Max: 0.139214
  -> Layer: shared_layers.0.bias | Grad Mean: 0.377178 | Grad Max: 1.744680
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.007051
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003563 | Grad Max: 0.003563
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002416 | Grad Max: 0.233308
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044826 | Grad Max: 1.289653
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000438 | Grad Max: 0.014844
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022577 | Grad Max: 0.114792
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000677
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004981 | Grad Max: 0.010689
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000312
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001443 | Grad Max: 0.003406
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001782 | Grad Max: 0.003363
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033313 | Grad Max: 0.033313
[GRADIENT NORM TOTAL] 7.3642

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.477
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041263 0.4958737] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.538 | Std: 0.047
[MASKS] A(Pass/Fail): 626/1422 | B: 539/1509 | C: 264/1784
[LOSS Ex1] A: 0.66064 | B: 0.65487 | C: 0.65191
[LOGITS Ex2 A] Mean Abs: 1.804 | Max: 5.959
[LOSS Ex2] A: 0.16755 | B: 0.38813 | C: 0.28832
** [JOINT LOSS] ** : 0.937141
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003756 | Grad Max: 0.129616
  -> Layer: shared_layers.0.bias | Grad Mean: 0.225316 | Grad Max: 1.091992
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002032 | Grad Max: 0.005748
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000610 | Grad Max: 0.000610
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001408 | Grad Max: 0.148753
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026269 | Grad Max: 0.820112
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.009459
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013144 | Grad Max: 0.074282
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000404
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002865 | Grad Max: 0.005981
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000187
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000823 | Grad Max: 0.001942
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000964 | Grad Max: 0.002524
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018813 | Grad Max: 0.018813
[GRADIENT NORM TOTAL] 4.4240

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.406
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53463125 0.46536872] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 615/1433 | B: 561/1487 | C: 273/1775
[LOSS Ex1] A: 0.66040 | B: 0.65559 | C: 0.65071
[LOGITS Ex2 A] Mean Abs: 1.717 | Max: 6.146
[LOSS Ex2] A: 0.17972 | B: 0.39961 | C: 0.29802
** [JOINT LOSS] ** : 0.948018
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003424 | Grad Max: 0.096112
  -> Layer: shared_layers.0.bias | Grad Mean: 0.301752 | Grad Max: 1.244625
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.006328
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005556 | Grad Max: 0.005556
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.253342
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036009 | Grad Max: 1.400724
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000347 | Grad Max: 0.012915
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018014 | Grad Max: 0.094839
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000510
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003915 | Grad Max: 0.008256
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001135 | Grad Max: 0.002643
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001449 | Grad Max: 0.002622
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027296 | Grad Max: 0.027296
[GRADIENT NORM TOTAL] 5.8941

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.541
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.66400915 0.3359909 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 682/1366 | B: 556/1492 | C: 270/1778
[LOSS Ex1] A: 0.65604 | B: 0.65200 | C: 0.64929
[LOGITS Ex2 A] Mean Abs: 1.777 | Max: 6.546
[LOSS Ex2] A: 0.16034 | B: 0.38515 | C: 0.28941
** [JOINT LOSS] ** : 0.930745
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005143 | Grad Max: 0.113626
  -> Layer: shared_layers.0.bias | Grad Mean: 0.403334 | Grad Max: 1.620932
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.007016
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002944 | Grad Max: 0.002944
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002489 | Grad Max: 0.305103
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047049 | Grad Max: 1.700200
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.013144
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023105 | Grad Max: 0.107401
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000680
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005030 | Grad Max: 0.010346
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000296
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001445 | Grad Max: 0.003373
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001761 | Grad Max: 0.003394
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033182 | Grad Max: 0.033182
[GRADIENT NORM TOTAL] 7.6532

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.607
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004343  0.49956572] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.047
[MASKS] A(Pass/Fail): 660/1388 | B: 512/1344 | C: 261/1787
[LOSS Ex1] A: 0.66107 | B: 0.65552 | C: 0.65270
[LOGITS Ex2 A] Mean Abs: 1.798 | Max: 6.785
[LOSS Ex2] A: 0.15181 | B: 0.37341 | C: 0.31324
** [JOINT LOSS] ** : 0.935917
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001809 | Grad Max: 0.044971
  -> Layer: shared_layers.0.bias | Grad Mean: 0.144278 | Grad Max: 0.560731
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.006219
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004490 | Grad Max: 0.004490
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000930 | Grad Max: 0.119309
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016817 | Grad Max: 0.671055
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.006542
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007363 | Grad Max: 0.048967
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000272
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001598 | Grad Max: 0.003865
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000144
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000469 | Grad Max: 0.001457
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000619 | Grad Max: 0.001630
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011681 | Grad Max: 0.011681
[GRADIENT NORM TOTAL] 2.8517

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.396
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6339124 0.3660876] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 644/1404 | B: 539/1509 | C: 287/1761
[LOSS Ex1] A: 0.65814 | B: 0.65474 | C: 0.64871
[LOGITS Ex2 A] Mean Abs: 1.823 | Max: 6.080
[LOSS Ex2] A: 0.18349 | B: 0.39300 | C: 0.30871
** [JOINT LOSS] ** : 0.948928
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005557 | Grad Max: 0.145253
  -> Layer: shared_layers.0.bias | Grad Mean: 0.407777 | Grad Max: 1.880695
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.006887
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002707 | Grad Max: 0.002707
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002662 | Grad Max: 0.274344
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049486 | Grad Max: 1.526621
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000469 | Grad Max: 0.016278
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024263 | Grad Max: 0.134811
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000725
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005301 | Grad Max: 0.010885
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000299
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001531 | Grad Max: 0.003422
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001915 | Grad Max: 0.003405
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035526 | Grad Max: 0.035526
[GRADIENT NORM TOTAL] 8.1892

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.472
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5847473 0.4152527] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 530/1086 | B: 563/1485 | C: 290/1758
[LOSS Ex1] A: 0.65678 | B: 0.65546 | C: 0.64920
[LOGITS Ex2 A] Mean Abs: 1.861 | Max: 5.270
[LOSS Ex2] A: 0.16918 | B: 0.40049 | C: 0.31807
** [JOINT LOSS] ** : 0.949722
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007057 | Grad Max: 0.204095
  -> Layer: shared_layers.0.bias | Grad Mean: 0.535982 | Grad Max: 2.553962
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.006737
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004335 | Grad Max: 0.004335
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003452 | Grad Max: 0.336403
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063999 | Grad Max: 1.911792
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000586 | Grad Max: 0.018530
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030297 | Grad Max: 0.142962
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000875
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006690 | Grad Max: 0.014084
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000382
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001920 | Grad Max: 0.004523
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002267 | Grad Max: 0.004120
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043277 | Grad Max: 0.043277
[GRADIENT NORM TOTAL] 10.7333

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.608
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067628 0.4932372] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 661/1387 | B: 557/1491 | C: 273/1775
[LOSS Ex1] A: 0.65712 | B: 0.65186 | C: 0.64960
[LOGITS Ex2 A] Mean Abs: 1.823 | Max: 6.869
[LOSS Ex2] A: 0.16803 | B: 0.37817 | C: 0.29352
** [JOINT LOSS] ** : 0.932767
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004845 | Grad Max: 0.148016
  -> Layer: shared_layers.0.bias | Grad Mean: 0.351495 | Grad Max: 1.663359
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006836
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006575 | Grad Max: 0.006575
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.242251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041686 | Grad Max: 1.363715
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.014889
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020134 | Grad Max: 0.110313
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000545
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004428 | Grad Max: 0.008924
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000277
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001276 | Grad Max: 0.003171
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001501 | Grad Max: 0.003375
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029182 | Grad Max: 0.029182
[GRADIENT NORM TOTAL] 7.0651

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.559
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50981414 0.49018592] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 657/1391 | B: 512/1344 | C: 269/1779
[LOSS Ex1] A: 0.65505 | B: 0.65539 | C: 0.65169
[LOGITS Ex2 A] Mean Abs: 1.775 | Max: 6.692
[LOSS Ex2] A: 0.16507 | B: 0.37184 | C: 0.29435
** [JOINT LOSS] ** : 0.931131
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002852 | Grad Max: 0.070220
  -> Layer: shared_layers.0.bias | Grad Mean: 0.221271 | Grad Max: 0.975239
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.007294
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006122 | Grad Max: 0.006122
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001415 | Grad Max: 0.189478
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026266 | Grad Max: 1.078609
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.009658
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012176 | Grad Max: 0.071733
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000420
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002668 | Grad Max: 0.006260
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000183
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000756 | Grad Max: 0.001906
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000832 | Grad Max: 0.002035
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016029 | Grad Max: 0.016029
[GRADIENT NORM TOTAL] 4.5358

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.576
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021423  0.49785763] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 649/1399 | B: 541/1507 | C: 261/1787
[LOSS Ex1] A: 0.65405 | B: 0.65461 | C: 0.65186
[LOGITS Ex2 A] Mean Abs: 1.768 | Max: 7.075
[LOSS Ex2] A: 0.18829 | B: 0.40786 | C: 0.29326
** [JOINT LOSS] ** : 0.949980
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005642 | Grad Max: 0.170055
  -> Layer: shared_layers.0.bias | Grad Mean: 0.385804 | Grad Max: 1.797332
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.007480
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005884 | Grad Max: 0.005884
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002414 | Grad Max: 0.286624
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045641 | Grad Max: 1.549579
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000417 | Grad Max: 0.014059
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021606 | Grad Max: 0.111518
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000654
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004697 | Grad Max: 0.009922
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000268
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001353 | Grad Max: 0.003091
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001696 | Grad Max: 0.003142
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031357 | Grad Max: 0.031357
[GRADIENT NORM TOTAL] 7.6559

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.480
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041737 0.4958263] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 627/1421 | B: 563/1485 | C: 299/1749
[LOSS Ex1] A: 0.66043 | B: 0.65534 | C: 0.65009
[LOGITS Ex2 A] Mean Abs: 1.750 | Max: 6.190
[LOSS Ex2] A: 0.15665 | B: 0.38902 | C: 0.28469
** [JOINT LOSS] ** : 0.932071
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003569 | Grad Max: 0.097914
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176771 | Grad Max: 0.750533
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005750
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003961 | Grad Max: 0.003961
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001227 | Grad Max: 0.199659
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022758 | Grad Max: 1.122902
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.008156
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011087 | Grad Max: 0.055936
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000342
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002431 | Grad Max: 0.005651
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000703 | Grad Max: 0.001770
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000973 | Grad Max: 0.002399
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016978 | Grad Max: 0.016978
[GRADIENT NORM TOTAL] 3.7905

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.409
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5347846  0.46521538] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 618/1430 | B: 557/1491 | C: 299/1749
[LOSS Ex1] A: 0.66020 | B: 0.65173 | C: 0.64746
[LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.147
[LOSS Ex2] A: 0.17407 | B: 0.36926 | C: 0.28423
** [JOINT LOSS] ** : 0.928982
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004923 | Grad Max: 0.125996
  -> Layer: shared_layers.0.bias | Grad Mean: 0.386704 | Grad Max: 1.571955
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006436
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005506 | Grad Max: 0.005506
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002394 | Grad Max: 0.283996
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044941 | Grad Max: 1.533026
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.016113
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022181 | Grad Max: 0.125093
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000629
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004849 | Grad Max: 0.010594
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000280
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001399 | Grad Max: 0.003269
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001597 | Grad Max: 0.003582
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031488 | Grad Max: 0.031488
[GRADIENT NORM TOTAL] 7.5523

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.544
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6651136  0.33488637] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.048
[MASKS] A(Pass/Fail): 685/1363 | B: 513/1343 | C: 171/1205
[LOSS Ex1] A: 0.65583 | B: 0.65527 | C: 0.65288
[LOGITS Ex2 A] Mean Abs: 1.804 | Max: 6.240
[LOSS Ex2] A: 0.17176 | B: 0.37314 | C: 0.28313
** [JOINT LOSS] ** : 0.930669
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007322 | Grad Max: 0.180582
  -> Layer: shared_layers.0.bias | Grad Mean: 0.461556 | Grad Max: 1.864977
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.007000
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005150 | Grad Max: 0.005150
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002978 | Grad Max: 0.299937
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055297 | Grad Max: 1.646744
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000529 | Grad Max: 0.017806
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027446 | Grad Max: 0.144206
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000741
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006113 | Grad Max: 0.011975
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001794 | Grad Max: 0.004019
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002169 | Grad Max: 0.004553
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041783 | Grad Max: 0.041783
[GRADIENT NORM TOTAL] 8.9401

[EPOCH SUMMARY] Train Loss: 0.9377

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9196 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9196 -> New: 0.9196)

############################## EPOCH 85/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.611
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50039464 0.49960533] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 662/1386 | B: 542/1506 | C: 296/1752
[LOSS Ex1] A: 0.66088 | B: 0.65449 | C: 0.64767
[LOGITS Ex2 A] Mean Abs: 1.795 | Max: 5.820
[LOSS Ex2] A: 0.16130 | B: 0.39623 | C: 0.30902
** [JOINT LOSS] ** : 0.943199
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002925 | Grad Max: 0.080279
  -> Layer: shared_layers.0.bias | Grad Mean: 0.245247 | Grad Max: 0.993112
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.006140
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001621 | Grad Max: 0.001621
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001541 | Grad Max: 0.147479
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028614 | Grad Max: 0.821046
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.008957
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014195 | Grad Max: 0.073036
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000453
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003081 | Grad Max: 0.006822
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000217
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000887 | Grad Max: 0.002346
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000983 | Grad Max: 0.002394
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019824 | Grad Max: 0.019824
[GRADIENT NORM TOTAL] 4.7515

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.399
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63472915 0.36527085] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 645/1403 | B: 563/1485 | C: 286/1762
[LOSS Ex1] A: 0.65793 | B: 0.65522 | C: 0.64731
[LOGITS Ex2 A] Mean Abs: 1.756 | Max: 5.848
[LOSS Ex2] A: 0.18105 | B: 0.39248 | C: 0.29020
** [JOINT LOSS] ** : 0.941397
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005830 | Grad Max: 0.144003
  -> Layer: shared_layers.0.bias | Grad Mean: 0.347118 | Grad Max: 1.647611
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.006435
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001272 | Grad Max: 0.001272
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.364225
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042758 | Grad Max: 2.036950
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000390 | Grad Max: 0.013523
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020097 | Grad Max: 0.100775
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000663
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004452 | Grad Max: 0.009699
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000283
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001291 | Grad Max: 0.003352
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001571 | Grad Max: 0.002949
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029610 | Grad Max: 0.029610
[GRADIENT NORM TOTAL] 7.0656

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.474
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5852907  0.41470936] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 532/1084 | B: 557/1491 | C: 253/1795
[LOSS Ex1] A: 0.65657 | B: 0.65161 | C: 0.65571
[LOGITS Ex2 A] Mean Abs: 1.767 | Max: 6.160
[LOSS Ex2] A: 0.17511 | B: 0.39099 | C: 0.29126
** [JOINT LOSS] ** : 0.940413
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008297 | Grad Max: 0.218964
  -> Layer: shared_layers.0.bias | Grad Mean: 0.524342 | Grad Max: 2.093285
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.006589
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000919 | Grad Max: 0.000919
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003386 | Grad Max: 0.386591
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063439 | Grad Max: 2.094940
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000605 | Grad Max: 0.019528
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031442 | Grad Max: 0.157913
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000828
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006921 | Grad Max: 0.013632
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000386
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002007 | Grad Max: 0.004402
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002480 | Grad Max: 0.004407
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046473 | Grad Max: 0.046473
[GRADIENT NORM TOTAL] 10.2472

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.612
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50679153 0.49320844] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 662/1386 | B: 513/1343 | C: 293/1755
[LOSS Ex1] A: 0.65692 | B: 0.65515 | C: 0.64734
[LOGITS Ex2 A] Mean Abs: 1.789 | Max: 5.801
[LOSS Ex2] A: 0.16845 | B: 0.37682 | C: 0.30470
** [JOINT LOSS] ** : 0.936464
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007531 | Grad Max: 0.213606
  -> Layer: shared_layers.0.bias | Grad Mean: 0.366047 | Grad Max: 1.431149
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.006941
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004224 | Grad Max: 0.004224
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.271496
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043656 | Grad Max: 1.484161
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.013380
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021320 | Grad Max: 0.098325
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000618
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004777 | Grad Max: 0.009637
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000317
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001390 | Grad Max: 0.003562
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001703 | Grad Max: 0.002992
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032102 | Grad Max: 0.032102
[GRADIENT NORM TOTAL] 6.9777

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.562
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.509851   0.49014902] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 657/1391 | B: 543/1505 | C: 275/1773
[LOSS Ex1] A: 0.65485 | B: 0.65437 | C: 0.64997
[LOGITS Ex2 A] Mean Abs: 1.806 | Max: 6.394
[LOSS Ex2] A: 0.16694 | B: 0.38740 | C: 0.29520
** [JOINT LOSS] ** : 0.936243
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002743 | Grad Max: 0.062419
  -> Layer: shared_layers.0.bias | Grad Mean: 0.161622 | Grad Max: 0.828384
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006384
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003889 | Grad Max: 0.003889
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001047 | Grad Max: 0.155894
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019150 | Grad Max: 0.884593
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.006366
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008156 | Grad Max: 0.047500
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000274
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001830 | Grad Max: 0.004639
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000134
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000535 | Grad Max: 0.001424
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000590 | Grad Max: 0.001813
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012170 | Grad Max: 0.012170
[GRADIENT NORM TOTAL] 3.3117

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.580
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021919  0.49780813] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 651/1397 | B: 563/1485 | C: 310/1738
[LOSS Ex1] A: 0.65385 | B: 0.65510 | C: 0.64614
[LOGITS Ex2 A] Mean Abs: 1.814 | Max: 6.428
[LOSS Ex2] A: 0.18668 | B: 0.39612 | C: 0.29257
** [JOINT LOSS] ** : 0.943487
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007860 | Grad Max: 0.303933
  -> Layer: shared_layers.0.bias | Grad Mean: 0.444824 | Grad Max: 1.837051
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002292 | Grad Max: 0.007282
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000491 | Grad Max: 0.000491
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003068 | Grad Max: 0.297850
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056213 | Grad Max: 1.690377
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000512 | Grad Max: 0.015826
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026307 | Grad Max: 0.124009
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000791
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005919 | Grad Max: 0.012702
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000352
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001719 | Grad Max: 0.004162
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001990 | Grad Max: 0.004022
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038662 | Grad Max: 0.038662
[GRADIENT NORM TOTAL] 9.0111

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.483
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041973 0.4958027] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 628/1420 | B: 557/1491 | C: 245/1803
[LOSS Ex1] A: 0.66025 | B: 0.65149 | C: 0.65215
[LOGITS Ex2 A] Mean Abs: 1.785 | Max: 6.162
[LOSS Ex2] A: 0.16445 | B: 0.37024 | C: 0.28893
** [JOINT LOSS] ** : 0.929175
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003738 | Grad Max: 0.117867
  -> Layer: shared_layers.0.bias | Grad Mean: 0.187586 | Grad Max: 0.873235
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005648
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002412 | Grad Max: 0.002412
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001247 | Grad Max: 0.158090
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022876 | Grad Max: 0.839845
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.007941
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010651 | Grad Max: 0.056878
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000330
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002377 | Grad Max: 0.005094
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000167
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000694 | Grad Max: 0.001659
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000791 | Grad Max: 0.002614
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016025 | Grad Max: 0.016025
[GRADIENT NORM TOTAL] 3.7682

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.412
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5349357  0.46506432] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.538 | Std: 0.046
[MASKS] A(Pass/Fail): 620/1428 | B: 513/1343 | C: 288/1760
[LOSS Ex1] A: 0.66001 | B: 0.65504 | C: 0.65041
[LOGITS Ex2 A] Mean Abs: 1.714 | Max: 6.062
[LOSS Ex2] A: 0.16967 | B: 0.37170 | C: 0.30777
** [JOINT LOSS] ** : 0.938200
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003769 | Grad Max: 0.092711
  -> Layer: shared_layers.0.bias | Grad Mean: 0.287839 | Grad Max: 1.173489
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.006005
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001436 | Grad Max: 0.001436
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001742 | Grad Max: 0.297394
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032610 | Grad Max: 1.679518
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000293 | Grad Max: 0.010035
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015334 | Grad Max: 0.075053
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000462
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003342 | Grad Max: 0.007835
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000227
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000959 | Grad Max: 0.002641
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001080 | Grad Max: 0.002401
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021341 | Grad Max: 0.021341
[GRADIENT NORM TOTAL] 5.7329

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.547
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.66616076 0.3338393 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.049
[MASKS] A(Pass/Fail): 686/1362 | B: 543/1505 | C: 267/1781
[LOSS Ex1] A: 0.65562 | B: 0.65426 | C: 0.64955
[LOGITS Ex2 A] Mean Abs: 1.750 | Max: 6.022
[LOSS Ex2] A: 0.16296 | B: 0.40958 | C: 0.27953
** [JOINT LOSS] ** : 0.937168
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004074 | Grad Max: 0.126571
  -> Layer: shared_layers.0.bias | Grad Mean: 0.396913 | Grad Max: 1.748523
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.007099
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006388 | Grad Max: 0.006388
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002445 | Grad Max: 0.338056
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045923 | Grad Max: 1.887187
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000423 | Grad Max: 0.014132
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022316 | Grad Max: 0.108614
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000655
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004865 | Grad Max: 0.010197
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001401 | Grad Max: 0.003409
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001641 | Grad Max: 0.003311
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031713 | Grad Max: 0.031713
[GRADIENT NORM TOTAL] 7.9458

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.614
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003738  0.49962622] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 662/1386 | B: 563/1485 | C: 268/1780
[LOSS Ex1] A: 0.66070 | B: 0.65499 | C: 0.65206
[LOGITS Ex2 A] Mean Abs: 1.779 | Max: 5.580
[LOSS Ex2] A: 0.15400 | B: 0.39717 | C: 0.28844
** [JOINT LOSS] ** : 0.935783
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003422 | Grad Max: 0.100730
  -> Layer: shared_layers.0.bias | Grad Mean: 0.170546 | Grad Max: 0.632470
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.005759
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000527 | Grad Max: 0.000527
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001111 | Grad Max: 0.204566
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019932 | Grad Max: 1.148082
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000176 | Grad Max: 0.006125
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008934 | Grad Max: 0.046858
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000334
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002015 | Grad Max: 0.005361
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000145
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000574 | Grad Max: 0.001585
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000649 | Grad Max: 0.001853
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012656 | Grad Max: 0.012656
[GRADIENT NORM TOTAL] 3.4402

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.402
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63549095 0.364509  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 645/1403 | B: 557/1491 | C: 275/1773
[LOSS Ex1] A: 0.65773 | B: 0.65137 | C: 0.65149
[LOGITS Ex2 A] Mean Abs: 1.803 | Max: 5.732
[LOSS Ex2] A: 0.18145 | B: 0.37555 | C: 0.28557
** [JOINT LOSS] ** : 0.934391
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004363 | Grad Max: 0.125121
  -> Layer: shared_layers.0.bias | Grad Mean: 0.361128 | Grad Max: 1.610136
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.006543
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000977 | Grad Max: 0.000977
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002305 | Grad Max: 0.236972
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043149 | Grad Max: 1.350258
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.012164
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020566 | Grad Max: 0.095078
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000583
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004498 | Grad Max: 0.009557
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000252
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001297 | Grad Max: 0.002966
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001478 | Grad Max: 0.003200
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029457 | Grad Max: 0.029457
[GRADIENT NORM TOTAL] 7.1588

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.477
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5857211  0.41427898] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 533/1083 | B: 513/1343 | C: 270/1778
[LOSS Ex1] A: 0.65636 | B: 0.65492 | C: 0.64925
[LOGITS Ex2 A] Mean Abs: 1.862 | Max: 5.635
[LOSS Ex2] A: 0.17588 | B: 0.36996 | C: 0.31226
** [JOINT LOSS] ** : 0.939545
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005976 | Grad Max: 0.159170
  -> Layer: shared_layers.0.bias | Grad Mean: 0.458288 | Grad Max: 2.049618
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.006865
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001572 | Grad Max: 0.001572
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002946 | Grad Max: 0.288431
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055510 | Grad Max: 1.591266
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000502 | Grad Max: 0.017483
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026266 | Grad Max: 0.137491
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000751
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005775 | Grad Max: 0.012288
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000313
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001677 | Grad Max: 0.003649
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001917 | Grad Max: 0.003686
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037762 | Grad Max: 0.037762
[GRADIENT NORM TOTAL] 9.0633

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.615
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50685906 0.49314094] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 664/1384 | B: 544/1504 | C: 279/1769
[LOSS Ex1] A: 0.65672 | B: 0.65414 | C: 0.64943
[LOGITS Ex2 A] Mean Abs: 1.806 | Max: 6.585
[LOSS Ex2] A: 0.16408 | B: 0.38993 | C: 0.29863
** [JOINT LOSS] ** : 0.937640
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002925 | Grad Max: 0.085773
  -> Layer: shared_layers.0.bias | Grad Mean: 0.254093 | Grad Max: 0.981542
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.007075
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007081 | Grad Max: 0.007081
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001578 | Grad Max: 0.175106
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029332 | Grad Max: 0.980644
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.010197
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014130 | Grad Max: 0.073503
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000397
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003073 | Grad Max: 0.006920
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000902 | Grad Max: 0.002013
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001035 | Grad Max: 0.002564
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020949 | Grad Max: 0.020949
[GRADIENT NORM TOTAL] 4.9337

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.566
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50987995 0.49012005] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 657/1391 | B: 563/1485 | C: 208/1168
[LOSS Ex1] A: 0.65462 | B: 0.65488 | C: 0.64752
[LOGITS Ex2 A] Mean Abs: 1.771 | Max: 5.383
[LOSS Ex2] A: 0.16084 | B: 0.39900 | C: 0.30348
** [JOINT LOSS] ** : 0.940114
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004468 | Grad Max: 0.116965
  -> Layer: shared_layers.0.bias | Grad Mean: 0.341547 | Grad Max: 1.542865
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.006800
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001744 | Grad Max: 0.001744
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.250037
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041592 | Grad Max: 1.422090
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000390 | Grad Max: 0.012062
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020342 | Grad Max: 0.095090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000623
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004490 | Grad Max: 0.009993
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000257
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001303 | Grad Max: 0.003144
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001486 | Grad Max: 0.002875
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029343 | Grad Max: 0.029343
[GRADIENT NORM TOTAL] 6.8264

[EPOCH SUMMARY] Train Loss: 0.9381

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9244 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 86/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.583
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50223905 0.49776092] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 652/1396 | B: 557/1491 | C: 275/1773
[LOSS Ex1] A: 0.65363 | B: 0.65125 | C: 0.64881
[LOGITS Ex2 A] Mean Abs: 1.737 | Max: 6.561
[LOSS Ex2] A: 0.18662 | B: 0.39080 | C: 0.29311
** [JOINT LOSS] ** : 0.941409
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005105 | Grad Max: 0.147720
  -> Layer: shared_layers.0.bias | Grad Mean: 0.418377 | Grad Max: 2.047090
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002346 | Grad Max: 0.007466
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003013 | Grad Max: 0.003013
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002631 | Grad Max: 0.301526
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049612 | Grad Max: 1.678511
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.014436
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023526 | Grad Max: 0.114790
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000691
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005085 | Grad Max: 0.011396
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000319
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001452 | Grad Max: 0.003571
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001691 | Grad Max: 0.003123
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032441 | Grad Max: 0.032441
[GRADIENT NORM TOTAL] 8.3143

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.486
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50420094 0.4957991 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.048
[MASKS] A(Pass/Fail): 630/1418 | B: 514/1342 | C: 262/1786
[LOSS Ex1] A: 0.66006 | B: 0.65481 | C: 0.65186
[LOGITS Ex2 A] Mean Abs: 1.727 | Max: 6.020
[LOSS Ex2] A: 0.16692 | B: 0.36347 | C: 0.29925
** [JOINT LOSS] ** : 0.932120
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003607 | Grad Max: 0.076069
  -> Layer: shared_layers.0.bias | Grad Mean: 0.237405 | Grad Max: 1.108621
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002031 | Grad Max: 0.005617
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003129 | Grad Max: 0.003129
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001657 | Grad Max: 0.238215
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030976 | Grad Max: 1.332285
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.009575
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015169 | Grad Max: 0.067861
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000497
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003345 | Grad Max: 0.007745
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000252
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000976 | Grad Max: 0.002608
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001191 | Grad Max: 0.002717
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022491 | Grad Max: 0.022491
[GRADIENT NORM TOTAL] 5.0342

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.415
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5350715 0.4649285] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.046
[MASKS] A(Pass/Fail): 620/1428 | B: 544/1504 | C: 262/1786
[LOSS Ex1] A: 0.65983 | B: 0.65403 | C: 0.65286
[LOGITS Ex2 A] Mean Abs: 1.752 | Max: 5.918
[LOSS Ex2] A: 0.17617 | B: 0.38583 | C: 0.32062
** [JOINT LOSS] ** : 0.949779
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004283 | Grad Max: 0.119861
  -> Layer: shared_layers.0.bias | Grad Mean: 0.266828 | Grad Max: 1.207068
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002003 | Grad Max: 0.006108
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002345 | Grad Max: 0.002345
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001695 | Grad Max: 0.173010
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031268 | Grad Max: 0.891740
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.008211
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015033 | Grad Max: 0.069502
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000472
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003338 | Grad Max: 0.007122
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000214
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000955 | Grad Max: 0.002386
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001100 | Grad Max: 0.002550
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020975 | Grad Max: 0.020975
[GRADIENT NORM TOTAL] 5.1132

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.551
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.66726863 0.33273134] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.049
[MASKS] A(Pass/Fail): 687/1361 | B: 564/1484 | C: 281/1767
[LOSS Ex1] A: 0.65542 | B: 0.65476 | C: 0.64621
[LOGITS Ex2 A] Mean Abs: 1.807 | Max: 5.940
[LOSS Ex2] A: 0.17239 | B: 0.39300 | C: 0.29692
** [JOINT LOSS] ** : 0.939569
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006331 | Grad Max: 0.185356
  -> Layer: shared_layers.0.bias | Grad Mean: 0.452796 | Grad Max: 2.043562
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002339 | Grad Max: 0.007280
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010387 | Grad Max: 0.010387
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002930 | Grad Max: 0.307946
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054354 | Grad Max: 1.728766
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000511 | Grad Max: 0.016308
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026628 | Grad Max: 0.130027
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000730
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005924 | Grad Max: 0.012591
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000317
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001734 | Grad Max: 0.003745
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002022 | Grad Max: 0.004056
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039548 | Grad Max: 0.039548
[GRADIENT NORM TOTAL] 9.0571

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.618
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50037 0.49963] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 663/1385 | B: 557/1491 | C: 281/1767
[LOSS Ex1] A: 0.66052 | B: 0.65114 | C: 0.64864
[LOGITS Ex2 A] Mean Abs: 1.792 | Max: 6.058
[LOSS Ex2] A: 0.15685 | B: 0.36880 | C: 0.28514
** [JOINT LOSS] ** : 0.923692
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002810 | Grad Max: 0.089864
  -> Layer: shared_layers.0.bias | Grad Mean: 0.245488 | Grad Max: 1.133690
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006190
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001988 | Grad Max: 0.001988
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001562 | Grad Max: 0.180420
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029263 | Grad Max: 0.993164
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000271 | Grad Max: 0.009406
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014202 | Grad Max: 0.073577
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000442
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003081 | Grad Max: 0.007036
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000897 | Grad Max: 0.002088
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001010 | Grad Max: 0.002996
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020573 | Grad Max: 0.020573
[GRADIENT NORM TOTAL] 5.0335

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.405
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6363522 0.3636478] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.049
[MASKS] A(Pass/Fail): 647/1401 | B: 514/1342 | C: 266/1782
[LOSS Ex1] A: 0.65754 | B: 0.65470 | C: 0.65130
[LOGITS Ex2 A] Mean Abs: 1.734 | Max: 5.965
[LOSS Ex2] A: 0.18791 | B: 0.37988 | C: 0.31029
** [JOINT LOSS] ** : 0.947205
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005183 | Grad Max: 0.130958
  -> Layer: shared_layers.0.bias | Grad Mean: 0.310304 | Grad Max: 1.447340
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.006772
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006774 | Grad Max: 0.006774
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001950 | Grad Max: 0.310156
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036205 | Grad Max: 1.748706
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.010720
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017244 | Grad Max: 0.083989
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000482
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003777 | Grad Max: 0.008170
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000230
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001086 | Grad Max: 0.002631
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001235 | Grad Max: 0.002440
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023751 | Grad Max: 0.023751
[GRADIENT NORM TOTAL] 6.0913

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.480
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5862583 0.4137417] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.049
[MASKS] A(Pass/Fail): 534/1082 | B: 545/1503 | C: 290/1758
[LOSS Ex1] A: 0.65615 | B: 0.65392 | C: 0.65045
[LOGITS Ex2 A] Mean Abs: 1.785 | Max: 6.555
[LOSS Ex2] A: 0.17059 | B: 0.41277 | C: 0.29829
** [JOINT LOSS] ** : 0.947390
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007399 | Grad Max: 0.188933
  -> Layer: shared_layers.0.bias | Grad Mean: 0.531803 | Grad Max: 2.349885
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.006762
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006662 | Grad Max: 0.006662
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003376 | Grad Max: 0.454963
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063376 | Grad Max: 2.553191
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000589 | Grad Max: 0.018130
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030820 | Grad Max: 0.144340
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000807
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006760 | Grad Max: 0.013431
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000398
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001961 | Grad Max: 0.004750
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002327 | Grad Max: 0.004136
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044817 | Grad Max: 0.044817
[GRADIENT NORM TOTAL] 10.5982

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.618
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50685745 0.49314258] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 664/1384 | B: 564/1484 | C: 301/1747
[LOSS Ex1] A: 0.65652 | B: 0.65466 | C: 0.64921
[LOGITS Ex2 A] Mean Abs: 1.780 | Max: 5.893
[LOSS Ex2] A: 0.17294 | B: 0.39546 | C: 0.27514
** [JOINT LOSS] ** : 0.934643
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004590 | Grad Max: 0.120034
  -> Layer: shared_layers.0.bias | Grad Mean: 0.267225 | Grad Max: 1.321267
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006643
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004846 | Grad Max: 0.004846
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001698 | Grad Max: 0.257900
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031486 | Grad Max: 1.448287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000285 | Grad Max: 0.008947
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014979 | Grad Max: 0.068971
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000482
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003292 | Grad Max: 0.008047
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000935 | Grad Max: 0.002373
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001099 | Grad Max: 0.002462
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021079 | Grad Max: 0.021079
[GRADIENT NORM TOTAL] 5.3768

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.569
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50995743 0.49004254] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 660/1388 | B: 557/1491 | C: 289/1759
[LOSS Ex1] A: 0.65441 | B: 0.65102 | C: 0.64850
[LOGITS Ex2 A] Mean Abs: 1.814 | Max: 6.168
[LOSS Ex2] A: 0.16168 | B: 0.37633 | C: 0.28547
** [JOINT LOSS] ** : 0.925804
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003805 | Grad Max: 0.108565
  -> Layer: shared_layers.0.bias | Grad Mean: 0.214512 | Grad Max: 0.964134
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.006625
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001721 | Grad Max: 0.001721
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001384 | Grad Max: 0.189326
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025592 | Grad Max: 1.030145
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.008496
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011962 | Grad Max: 0.060736
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000391
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002647 | Grad Max: 0.006341
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000768 | Grad Max: 0.001758
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000890 | Grad Max: 0.002761
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017431 | Grad Max: 0.017431
[GRADIENT NORM TOTAL] 4.3973

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.587
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022315  0.49776852] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 653/1395 | B: 514/1342 | C: 278/1770
[LOSS Ex1] A: 0.65342 | B: 0.65459 | C: 0.64859
[LOGITS Ex2 A] Mean Abs: 1.797 | Max: 6.628
[LOSS Ex2] A: 0.19436 | B: 0.36088 | C: 0.30498
** [JOINT LOSS] ** : 0.938940
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006531 | Grad Max: 0.198765
  -> Layer: shared_layers.0.bias | Grad Mean: 0.369131 | Grad Max: 1.593338
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.007165
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000329 | Grad Max: 0.000329
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002497 | Grad Max: 0.258218
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045909 | Grad Max: 1.447757
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000409 | Grad Max: 0.012028
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021223 | Grad Max: 0.092679
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000615
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004789 | Grad Max: 0.009741
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000303
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001394 | Grad Max: 0.003379
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001651 | Grad Max: 0.003096
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031659 | Grad Max: 0.031659
[GRADIENT NORM TOTAL] 7.4095

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.489
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041656  0.49583438] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.539 | Std: 0.048
[MASKS] A(Pass/Fail): 633/1415 | B: 545/1503 | C: 288/1760
[LOSS Ex1] A: 0.65988 | B: 0.65380 | C: 0.64681
[LOGITS Ex2 A] Mean Abs: 1.781 | Max: 5.850
[LOSS Ex2] A: 0.16469 | B: 0.38890 | C: 0.30645
** [JOINT LOSS] ** : 0.940177
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003774 | Grad Max: 0.099501
  -> Layer: shared_layers.0.bias | Grad Mean: 0.175529 | Grad Max: 0.828740
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.006074
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004666 | Grad Max: 0.004666
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001147 | Grad Max: 0.108907
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020803 | Grad Max: 0.598575
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000190 | Grad Max: 0.005523
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009675 | Grad Max: 0.044510
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000357
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002176 | Grad Max: 0.005668
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000147
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001650
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000671 | Grad Max: 0.001771
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013352 | Grad Max: 0.013352
[GRADIENT NORM TOTAL] 3.3475

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.418
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53531635 0.46468365] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 621/1427 | B: 564/1484 | C: 271/1777
[LOSS Ex1] A: 0.65965 | B: 0.65454 | C: 0.64938
[LOGITS Ex2 A] Mean Abs: 1.709 | Max: 7.028
[LOSS Ex2] A: 0.17704 | B: 0.40155 | C: 0.29410
** [JOINT LOSS] ** : 0.945420
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006334 | Grad Max: 0.148694
  -> Layer: shared_layers.0.bias | Grad Mean: 0.439906 | Grad Max: 1.924764
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.006306
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007059 | Grad Max: 0.007059
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002772 | Grad Max: 0.307066
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051944 | Grad Max: 1.721029
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000495 | Grad Max: 0.016312
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025869 | Grad Max: 0.126429
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000718
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005709 | Grad Max: 0.011651
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000347
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001658 | Grad Max: 0.003989
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001971 | Grad Max: 0.003400
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037770 | Grad Max: 0.037770
[GRADIENT NORM TOTAL] 8.5671

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.553
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.66844493 0.33155513] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.049
[MASKS] A(Pass/Fail): 688/1360 | B: 557/1491 | C: 292/1756
[LOSS Ex1] A: 0.65521 | B: 0.65091 | C: 0.64860
[LOGITS Ex2 A] Mean Abs: 1.749 | Max: 6.225
[LOSS Ex2] A: 0.16226 | B: 0.38335 | C: 0.29748
** [JOINT LOSS] ** : 0.932604
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006409 | Grad Max: 0.141052
  -> Layer: shared_layers.0.bias | Grad Mean: 0.468551 | Grad Max: 1.977912
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.006857
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005920 | Grad Max: 0.005920
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002991 | Grad Max: 0.384782
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056039 | Grad Max: 2.176695
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000525 | Grad Max: 0.017409
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027559 | Grad Max: 0.138257
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000787
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006064 | Grad Max: 0.012445
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000382
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001750 | Grad Max: 0.004608
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002010 | Grad Max: 0.003763
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038764 | Grad Max: 0.038764
[GRADIENT NORM TOTAL] 9.2913

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.620
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003841  0.49961594] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.048
[MASKS] A(Pass/Fail): 665/1383 | B: 516/1340 | C: 186/1190
[LOSS Ex1] A: 0.66034 | B: 0.65448 | C: 0.64935
[LOGITS Ex2 A] Mean Abs: 1.779 | Max: 5.459
[LOSS Ex2] A: 0.16420 | B: 0.37163 | C: 0.26742
** [JOINT LOSS] ** : 0.922477
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005492 | Grad Max: 0.183125
  -> Layer: shared_layers.0.bias | Grad Mean: 0.254672 | Grad Max: 1.016137
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.006698
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008878 | Grad Max: 0.008878
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001629 | Grad Max: 0.279880
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029007 | Grad Max: 1.548691
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000261 | Grad Max: 0.007124
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013412 | Grad Max: 0.050487
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000430
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003081 | Grad Max: 0.007048
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000214
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000883 | Grad Max: 0.002495
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001019 | Grad Max: 0.002672
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018819 | Grad Max: 0.018819
[GRADIENT NORM TOTAL] 4.9684

[EPOCH SUMMARY] Train Loss: 0.9372

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9190 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9196 -> New: 0.9190)

############################## EPOCH 87/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.407
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.637204   0.36279604] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 649/1399 | B: 548/1500 | C: 275/1773
[LOSS Ex1] A: 0.65735 | B: 0.65369 | C: 0.65106
[LOGITS Ex2 A] Mean Abs: 1.808 | Max: 5.934
[LOSS Ex2] A: 0.17742 | B: 0.39195 | C: 0.30089
** [JOINT LOSS] ** : 0.944119
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003559 | Grad Max: 0.096520
  -> Layer: shared_layers.0.bias | Grad Mean: 0.308446 | Grad Max: 1.166812
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006632
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006358 | Grad Max: 0.006358
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001896 | Grad Max: 0.214410
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035460 | Grad Max: 1.199634
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000346 | Grad Max: 0.011507
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018364 | Grad Max: 0.091302
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000624
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003972 | Grad Max: 0.009543
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000255
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001159 | Grad Max: 0.002901
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001334 | Grad Max: 0.002713
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026652 | Grad Max: 0.026652
[GRADIENT NORM TOTAL] 5.8777

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.483
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58676064 0.4132394 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.049
[MASKS] A(Pass/Fail): 534/1082 | B: 565/1483 | C: 278/1770
[LOSS Ex1] A: 0.65596 | B: 0.65443 | C: 0.64945
[LOGITS Ex2 A] Mean Abs: 1.844 | Max: 6.454
[LOSS Ex2] A: 0.16110 | B: 0.38836 | C: 0.28640
** [JOINT LOSS] ** : 0.931902
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004443 | Grad Max: 0.130053
  -> Layer: shared_layers.0.bias | Grad Mean: 0.379303 | Grad Max: 1.766080
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.006710
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007912 | Grad Max: 0.007912
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002321 | Grad Max: 0.276676
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043796 | Grad Max: 1.516453
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.013634
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021201 | Grad Max: 0.098411
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000632
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004606 | Grad Max: 0.009896
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000260
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001333 | Grad Max: 0.003114
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001447 | Grad Max: 0.003540
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029557 | Grad Max: 0.029557
[GRADIENT NORM TOTAL] 7.4833

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.621
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50687194 0.4931281 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 666/1382 | B: 559/1489 | C: 290/1758
[LOSS Ex1] A: 0.65633 | B: 0.65079 | C: 0.64846
[LOGITS Ex2 A] Mean Abs: 1.814 | Max: 7.673
[LOSS Ex2] A: 0.16213 | B: 0.37191 | C: 0.28593
** [JOINT LOSS] ** : 0.925181
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.070501
  -> Layer: shared_layers.0.bias | Grad Mean: 0.173462 | Grad Max: 0.864734
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002267 | Grad Max: 0.006957
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006290 | Grad Max: 0.006290
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001120 | Grad Max: 0.158128
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020619 | Grad Max: 0.888754
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.007213
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008896 | Grad Max: 0.044972
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000308
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001943 | Grad Max: 0.004689
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000131
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000573 | Grad Max: 0.001472
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000602 | Grad Max: 0.002067
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012977 | Grad Max: 0.012977
[GRADIENT NORM TOTAL] 3.7620

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.572
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5100233 0.4899767] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 661/1387 | B: 516/1340 | C: 273/1775
[LOSS Ex1] A: 0.65420 | B: 0.65436 | C: 0.64916
[LOGITS Ex2 A] Mean Abs: 1.769 | Max: 5.252
[LOSS Ex2] A: 0.17023 | B: 0.37819 | C: 0.31117
** [JOINT LOSS] ** : 0.939105
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003593 | Grad Max: 0.090492
  -> Layer: shared_layers.0.bias | Grad Mean: 0.294026 | Grad Max: 1.261735
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002229 | Grad Max: 0.007348
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005804 | Grad Max: 0.005804
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.201028
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034216 | Grad Max: 1.127064
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000317 | Grad Max: 0.010836
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016813 | Grad Max: 0.086741
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000458
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003716 | Grad Max: 0.008084
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001078 | Grad Max: 0.002602
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001194 | Grad Max: 0.002265
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023834 | Grad Max: 0.023834
[GRADIENT NORM TOTAL] 5.6853

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.590
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50227284 0.49772716] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 654/1394 | B: 548/1500 | C: 291/1757
[LOSS Ex1] A: 0.65321 | B: 0.65357 | C: 0.64698
[LOGITS Ex2 A] Mean Abs: 1.745 | Max: 6.299
[LOSS Ex2] A: 0.18290 | B: 0.40866 | C: 0.27180
** [JOINT LOSS] ** : 0.939041
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004799 | Grad Max: 0.137791
  -> Layer: shared_layers.0.bias | Grad Mean: 0.435849 | Grad Max: 1.968644
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002359 | Grad Max: 0.007715
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009492 | Grad Max: 0.009492
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002649 | Grad Max: 0.338881
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049730 | Grad Max: 1.883790
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.012680
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023729 | Grad Max: 0.107616
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000744
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005179 | Grad Max: 0.011471
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000297
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001488 | Grad Max: 0.003604
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001716 | Grad Max: 0.003135
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033219 | Grad Max: 0.033219
[GRADIENT NORM TOTAL] 8.6420

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.491
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50416994 0.49583006] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 634/1414 | B: 565/1483 | C: 303/1745
[LOSS Ex1] A: 0.65971 | B: 0.65431 | C: 0.64660
[LOGITS Ex2 A] Mean Abs: 1.751 | Max: 5.897
[LOSS Ex2] A: 0.15760 | B: 0.39136 | C: 0.29270
** [JOINT LOSS] ** : 0.934095
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.065099
  -> Layer: shared_layers.0.bias | Grad Mean: 0.177111 | Grad Max: 0.818712
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.006543
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005871 | Grad Max: 0.005871
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.260585
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021575 | Grad Max: 1.456546
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.007524
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010544 | Grad Max: 0.052393
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000343
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002289 | Grad Max: 0.005243
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000144
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000672 | Grad Max: 0.001586
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000838 | Grad Max: 0.002104
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015912 | Grad Max: 0.015912
[GRADIENT NORM TOTAL] 3.8582

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.421
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5354926  0.46450746] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 621/1427 | B: 561/1487 | C: 267/1781
[LOSS Ex1] A: 0.65948 | B: 0.65066 | C: 0.64969
[LOGITS Ex2 A] Mean Abs: 1.772 | Max: 6.067
[LOSS Ex2] A: 0.17596 | B: 0.37378 | C: 0.29008
** [JOINT LOSS] ** : 0.933217
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005486 | Grad Max: 0.124556
  -> Layer: shared_layers.0.bias | Grad Mean: 0.353670 | Grad Max: 1.541149
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006160
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004101 | Grad Max: 0.004101
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002282 | Grad Max: 0.239072
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042522 | Grad Max: 1.349183
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.012806
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021003 | Grad Max: 0.100265
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000588
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004615 | Grad Max: 0.009643
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000274
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001341 | Grad Max: 0.003067
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001584 | Grad Max: 0.003404
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030757 | Grad Max: 0.030757
[GRADIENT NORM TOTAL] 6.9118

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.556
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6695137  0.33048636] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.049
[MASKS] A(Pass/Fail): 690/1358 | B: 516/1340 | C: 279/1769
[LOSS Ex1] A: 0.65503 | B: 0.65424 | C: 0.64839
[LOGITS Ex2 A] Mean Abs: 1.813 | Max: 5.721
[LOSS Ex2] A: 0.17522 | B: 0.36961 | C: 0.30906
** [JOINT LOSS] ** : 0.937182
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008513 | Grad Max: 0.217738
  -> Layer: shared_layers.0.bias | Grad Mean: 0.522860 | Grad Max: 2.324744
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.006747
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003171 | Grad Max: 0.003171
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003417 | Grad Max: 0.339107
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063679 | Grad Max: 1.912306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000602 | Grad Max: 0.018664
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031658 | Grad Max: 0.148125
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000852
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007024 | Grad Max: 0.014215
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000365
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002052 | Grad Max: 0.004516
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002461 | Grad Max: 0.004801
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046732 | Grad Max: 0.046732
[GRADIENT NORM TOTAL] 10.1643

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.624
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004 0.4996] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 666/1382 | B: 548/1500 | C: 295/1753
[LOSS Ex1] A: 0.66017 | B: 0.65345 | C: 0.64648
[LOGITS Ex2 A] Mean Abs: 1.803 | Max: 5.461
[LOSS Ex2] A: 0.16647 | B: 0.38752 | C: 0.29881
** [JOINT LOSS] ** : 0.937636
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004660 | Grad Max: 0.095340
  -> Layer: shared_layers.0.bias | Grad Mean: 0.279307 | Grad Max: 1.260947
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005858
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002755 | Grad Max: 0.002755
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001859 | Grad Max: 0.203356
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034720 | Grad Max: 1.102229
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.010504
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017815 | Grad Max: 0.089640
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000511
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003889 | Grad Max: 0.008303
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000222
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001122 | Grad Max: 0.002465
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001258 | Grad Max: 0.002739
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024513 | Grad Max: 0.024513
[GRADIENT NORM TOTAL] 5.5065

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.410
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6380608  0.36193916] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 652/1396 | B: 565/1483 | C: 292/1756
[LOSS Ex1] A: 0.65716 | B: 0.65420 | C: 0.64826
[LOGITS Ex2 A] Mean Abs: 1.748 | Max: 5.976
[LOSS Ex2] A: 0.17770 | B: 0.40218 | C: 0.28812
** [JOINT LOSS] ** : 0.942539
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005468 | Grad Max: 0.134870
  -> Layer: shared_layers.0.bias | Grad Mean: 0.341299 | Grad Max: 1.506555
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.006175
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000246 | Grad Max: 0.000246
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.238635
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041495 | Grad Max: 1.346310
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000393 | Grad Max: 0.012202
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020600 | Grad Max: 0.101092
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000594
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004586 | Grad Max: 0.009353
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000253
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001350 | Grad Max: 0.003033
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001666 | Grad Max: 0.003016
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031322 | Grad Max: 0.031322
[GRADIENT NORM TOTAL] 6.7023

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.485
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5873356 0.4126644] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.049
[MASKS] A(Pass/Fail): 535/1081 | B: 561/1487 | C: 282/1766
[LOSS Ex1] A: 0.65576 | B: 0.65054 | C: 0.64664
[LOGITS Ex2 A] Mean Abs: 1.763 | Max: 6.308
[LOSS Ex2] A: 0.17642 | B: 0.39394 | C: 0.30397
** [JOINT LOSS] ** : 0.942427
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008920 | Grad Max: 0.227429
  -> Layer: shared_layers.0.bias | Grad Mean: 0.542684 | Grad Max: 2.246858
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002315 | Grad Max: 0.007084
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007449 | Grad Max: 0.007449
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003537 | Grad Max: 0.389215
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065893 | Grad Max: 2.121088
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.018073
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032201 | Grad Max: 0.151392
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000961
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007195 | Grad Max: 0.015220
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000439
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002099 | Grad Max: 0.004959
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002559 | Grad Max: 0.004951
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048121 | Grad Max: 0.048121
[GRADIENT NORM TOTAL] 10.6579

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.625
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068829  0.49311706] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 670/1378 | B: 516/1340 | C: 267/1781
[LOSS Ex1] A: 0.65614 | B: 0.65413 | C: 0.65117
[LOGITS Ex2 A] Mean Abs: 1.771 | Max: 6.126
[LOSS Ex2] A: 0.17012 | B: 0.36879 | C: 0.30820
** [JOINT LOSS] ** : 0.936186
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007255 | Grad Max: 0.200666
  -> Layer: shared_layers.0.bias | Grad Mean: 0.326862 | Grad Max: 1.365790
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002247 | Grad Max: 0.007333
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011881 | Grad Max: 0.011881
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.278151
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041240 | Grad Max: 1.488259
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000379 | Grad Max: 0.011959
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019646 | Grad Max: 0.086279
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000585
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004427 | Grad Max: 0.009089
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000296
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001280 | Grad Max: 0.003381
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001527 | Grad Max: 0.003131
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028702 | Grad Max: 0.028702
[GRADIENT NORM TOTAL] 6.5351

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.576
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51012135 0.48987868] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.049
[MASKS] A(Pass/Fail): 661/1387 | B: 548/1500 | C: 268/1780
[LOSS Ex1] A: 0.65400 | B: 0.65334 | C: 0.65054
[LOGITS Ex2 A] Mean Abs: 1.804 | Max: 5.776
[LOSS Ex2] A: 0.16547 | B: 0.38978 | C: 0.30215
** [JOINT LOSS] ** : 0.938429
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002877 | Grad Max: 0.081363
  -> Layer: shared_layers.0.bias | Grad Mean: 0.189537 | Grad Max: 0.731566
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006795
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001722 | Grad Max: 0.001722
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001203 | Grad Max: 0.145758
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022416 | Grad Max: 0.813338
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000194 | Grad Max: 0.007963
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010252 | Grad Max: 0.057142
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000375
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002240 | Grad Max: 0.005882
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001913
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000647 | Grad Max: 0.001564
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013012 | Grad Max: 0.013012
[GRADIENT NORM TOTAL] 3.7778

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.594
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022637  0.49773625] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 655/1393 | B: 565/1483 | C: 170/1206
[LOSS Ex1] A: 0.65301 | B: 0.65409 | C: 0.65303
[LOGITS Ex2 A] Mean Abs: 1.781 | Max: 6.184
[LOSS Ex2] A: 0.18642 | B: 0.38807 | C: 0.29124
** [JOINT LOSS] ** : 0.941951
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005111 | Grad Max: 0.161090
  -> Layer: shared_layers.0.bias | Grad Mean: 0.325026 | Grad Max: 1.468766
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.006322
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002456 | Grad Max: 0.002456
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.231702
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040107 | Grad Max: 1.289719
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.011232
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018731 | Grad Max: 0.084171
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000566
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004192 | Grad Max: 0.009061
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000256
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001220 | Grad Max: 0.002960
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001392 | Grad Max: 0.003004
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027080 | Grad Max: 0.027080
[GRADIENT NORM TOTAL] 6.5852

[EPOCH SUMMARY] Train Loss: 0.9374

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9143 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9190 -> New: 0.9143)

############################## EPOCH 88/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.494
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50413597 0.49586403] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.049
[MASKS] A(Pass/Fail): 634/1414 | B: 562/1486 | C: 297/1751
[LOSS Ex1] A: 0.65954 | B: 0.65042 | C: 0.64710
[LOGITS Ex2 A] Mean Abs: 1.759 | Max: 6.446
[LOSS Ex2] A: 0.16386 | B: 0.36679 | C: 0.28866
** [JOINT LOSS] ** : 0.925460
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003519 | Grad Max: 0.088640
  -> Layer: shared_layers.0.bias | Grad Mean: 0.214625 | Grad Max: 0.993426
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002196 | Grad Max: 0.006277
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004747 | Grad Max: 0.004747
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001371 | Grad Max: 0.163329
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025114 | Grad Max: 0.924035
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000230 | Grad Max: 0.006692
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012113 | Grad Max: 0.054974
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000441
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002691 | Grad Max: 0.006190
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000798 | Grad Max: 0.001766
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000917 | Grad Max: 0.002763
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018604 | Grad Max: 0.018604
[GRADIENT NORM TOTAL] 4.3379

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.424
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5357268  0.46427318] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.539 | Std: 0.047
[MASKS] A(Pass/Fail): 623/1425 | B: 516/1340 | C: 252/1796
[LOSS Ex1] A: 0.65931 | B: 0.65402 | C: 0.65026
[LOGITS Ex2 A] Mean Abs: 1.687 | Max: 6.797
[LOSS Ex2] A: 0.18148 | B: 0.37605 | C: 0.29146
** [JOINT LOSS] ** : 0.937525
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004688 | Grad Max: 0.099575
  -> Layer: shared_layers.0.bias | Grad Mean: 0.361064 | Grad Max: 1.458034
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.006445
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000953 | Grad Max: 0.000953
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.286295
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042041 | Grad Max: 1.616733
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.010292
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020465 | Grad Max: 0.088120
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000571
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004489 | Grad Max: 0.009130
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001299 | Grad Max: 0.003112
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001507 | Grad Max: 0.002935
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028797 | Grad Max: 0.028797
[GRADIENT NORM TOTAL] 7.0741

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.559
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.67064774 0.3293523 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.050
[MASKS] A(Pass/Fail): 690/1358 | B: 549/1499 | C: 281/1767
[LOSS Ex1] A: 0.65483 | B: 0.65323 | C: 0.64981
[LOGITS Ex2 A] Mean Abs: 1.732 | Max: 6.792
[LOSS Ex2] A: 0.16240 | B: 0.40955 | C: 0.32043
** [JOINT LOSS] ** : 0.950077
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004741 | Grad Max: 0.147247
  -> Layer: shared_layers.0.bias | Grad Mean: 0.448967 | Grad Max: 1.945120
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006882
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006011 | Grad Max: 0.006011
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002800 | Grad Max: 0.387232
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052805 | Grad Max: 2.158943
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000484 | Grad Max: 0.016521
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025842 | Grad Max: 0.136008
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000712
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005641 | Grad Max: 0.011705
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000333
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001650 | Grad Max: 0.003967
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001904 | Grad Max: 0.003809
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037689 | Grad Max: 0.037689
[GRADIENT NORM TOTAL] 9.1317

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.627
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004329  0.49956706] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 666/1382 | B: 565/1483 | C: 298/1750
[LOSS Ex1] A: 0.66000 | B: 0.65397 | C: 0.64641
[LOGITS Ex2 A] Mean Abs: 1.753 | Max: 5.523
[LOSS Ex2] A: 0.15736 | B: 0.38793 | C: 0.30467
** [JOINT LOSS] ** : 0.936781
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003939 | Grad Max: 0.105045
  -> Layer: shared_layers.0.bias | Grad Mean: 0.240508 | Grad Max: 0.977082
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.006120
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003783 | Grad Max: 0.003783
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001491 | Grad Max: 0.199540
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027716 | Grad Max: 1.110599
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000277 | Grad Max: 0.008911
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014582 | Grad Max: 0.066811
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000430
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003237 | Grad Max: 0.006753
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000254
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000956 | Grad Max: 0.002677
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001188 | Grad Max: 0.002725
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022506 | Grad Max: 0.022506
[GRADIENT NORM TOTAL] 4.4978

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.413
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63886786 0.36113217] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.050
[MASKS] A(Pass/Fail): 653/1395 | B: 562/1486 | C: 270/1778
[LOSS Ex1] A: 0.65698 | B: 0.65030 | C: 0.64864
[LOGITS Ex2 A] Mean Abs: 1.786 | Max: 5.612
[LOSS Ex2] A: 0.17813 | B: 0.37216 | C: 0.29210
** [JOINT LOSS] ** : 0.932772
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003829 | Grad Max: 0.121108
  -> Layer: shared_layers.0.bias | Grad Mean: 0.315441 | Grad Max: 1.454433
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006318
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005170 | Grad Max: 0.005170
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.240766
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038252 | Grad Max: 1.346906
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000347 | Grad Max: 0.011724
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018478 | Grad Max: 0.090099
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000572
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004080 | Grad Max: 0.008662
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001190 | Grad Max: 0.002668
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001341 | Grad Max: 0.003236
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026939 | Grad Max: 0.026939
[GRADIENT NORM TOTAL] 6.6164

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.488
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58776844 0.4122315 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 535/1081 | B: 516/1340 | C: 282/1766
[LOSS Ex1] A: 0.65557 | B: 0.65391 | C: 0.64939
[LOGITS Ex2 A] Mean Abs: 1.835 | Max: 5.486
[LOSS Ex2] A: 0.16911 | B: 0.37494 | C: 0.28732
** [JOINT LOSS] ** : 0.930078
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006982 | Grad Max: 0.182644
  -> Layer: shared_layers.0.bias | Grad Mean: 0.508850 | Grad Max: 2.282396
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.006154
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003739 | Grad Max: 0.003739
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003271 | Grad Max: 0.320944
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061010 | Grad Max: 1.822636
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000568 | Grad Max: 0.016976
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030092 | Grad Max: 0.140125
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000830
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006661 | Grad Max: 0.014046
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000365
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001948 | Grad Max: 0.004428
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002188 | Grad Max: 0.004567
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043909 | Grad Max: 0.043909
[GRADIENT NORM TOTAL] 10.0762

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.628
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50687426 0.4931257 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 671/1377 | B: 549/1499 | C: 306/1742
[LOSS Ex1] A: 0.65595 | B: 0.65312 | C: 0.64415
[LOGITS Ex2 A] Mean Abs: 1.802 | Max: 7.064
[LOSS Ex2] A: 0.16285 | B: 0.39494 | C: 0.28246
** [JOINT LOSS] ** : 0.931154
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005460 | Grad Max: 0.137292
  -> Layer: shared_layers.0.bias | Grad Mean: 0.379656 | Grad Max: 1.679972
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002215 | Grad Max: 0.006532
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001494 | Grad Max: 0.001494
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002406 | Grad Max: 0.242965
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045062 | Grad Max: 1.347069
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.012201
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022797 | Grad Max: 0.106665
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000694
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005066 | Grad Max: 0.011175
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000269
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001499 | Grad Max: 0.003319
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001714 | Grad Max: 0.003835
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034480 | Grad Max: 0.034480
[GRADIENT NORM TOTAL] 7.3393

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.579
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101537  0.48984632] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 662/1386 | B: 565/1483 | C: 280/1768
[LOSS Ex1] A: 0.65379 | B: 0.65387 | C: 0.64962
[LOGITS Ex2 A] Mean Abs: 1.758 | Max: 5.864
[LOSS Ex2] A: 0.16135 | B: 0.39176 | C: 0.28328
** [JOINT LOSS] ** : 0.931222
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.037427
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134070 | Grad Max: 0.608085
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.007373
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012715 | Grad Max: 0.012715
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000888 | Grad Max: 0.230786
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016432 | Grad Max: 1.297672
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.006254
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006855 | Grad Max: 0.039363
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000287
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001461 | Grad Max: 0.003995
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000408 | Grad Max: 0.001259
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000519 | Grad Max: 0.001753
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008697 | Grad Max: 0.008697
[GRADIENT NORM TOTAL] 3.0670

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.597
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022763  0.49772373] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 655/1393 | B: 562/1486 | C: 295/1753
[LOSS Ex1] A: 0.65280 | B: 0.65018 | C: 0.64545
[LOGITS Ex2 A] Mean Abs: 1.728 | Max: 7.048
[LOSS Ex2] A: 0.18527 | B: 0.38772 | C: 0.30128
** [JOINT LOSS] ** : 0.940904
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004676 | Grad Max: 0.117181
  -> Layer: shared_layers.0.bias | Grad Mean: 0.340410 | Grad Max: 1.443603
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002379 | Grad Max: 0.007232
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001298 | Grad Max: 0.001298
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.234378
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039086 | Grad Max: 1.331246
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.012858
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019713 | Grad Max: 0.104805
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000560
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004284 | Grad Max: 0.008849
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001239 | Grad Max: 0.002907
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001443 | Grad Max: 0.002687
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027674 | Grad Max: 0.027674
[GRADIENT NORM TOTAL] 6.4536

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.497
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50411165 0.49588835] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.049
[MASKS] A(Pass/Fail): 636/1412 | B: 516/1340 | C: 290/1758
[LOSS Ex1] A: 0.65936 | B: 0.65379 | C: 0.64717
[LOGITS Ex2 A] Mean Abs: 1.726 | Max: 5.638
[LOSS Ex2] A: 0.16433 | B: 0.36436 | C: 0.25821
** [JOINT LOSS] ** : 0.915744
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003761 | Grad Max: 0.125368
  -> Layer: shared_layers.0.bias | Grad Mean: 0.193989 | Grad Max: 0.781740
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005920
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005297 | Grad Max: 0.005297
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001352 | Grad Max: 0.155480
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024765 | Grad Max: 0.867581
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.006785
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012166 | Grad Max: 0.049139
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000420
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002686 | Grad Max: 0.006568
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000205
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000756 | Grad Max: 0.002129
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000866 | Grad Max: 0.002695
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015402 | Grad Max: 0.015402
[GRADIENT NORM TOTAL] 3.8746

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.427
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5358893 0.4641107] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 624/1424 | B: 550/1498 | C: 262/1786
[LOSS Ex1] A: 0.65913 | B: 0.65300 | C: 0.65212
[LOGITS Ex2 A] Mean Abs: 1.763 | Max: 6.334
[LOSS Ex2] A: 0.17689 | B: 0.39156 | C: 0.32236
** [JOINT LOSS] ** : 0.951688
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005359 | Grad Max: 0.137401
  -> Layer: shared_layers.0.bias | Grad Mean: 0.346765 | Grad Max: 1.530022
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002016 | Grad Max: 0.005916
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004178 | Grad Max: 0.004178
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.273512
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041928 | Grad Max: 1.478631
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000390 | Grad Max: 0.012854
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020717 | Grad Max: 0.103062
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000621
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004585 | Grad Max: 0.010031
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000263
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001336 | Grad Max: 0.003247
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001483 | Grad Max: 0.003062
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029097 | Grad Max: 0.029097
[GRADIENT NORM TOTAL] 6.8930

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.562
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.67175174 0.32824826] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.050
[MASKS] A(Pass/Fail): 691/1357 | B: 565/1483 | C: 266/1782
[LOSS Ex1] A: 0.65463 | B: 0.65374 | C: 0.65177
[LOGITS Ex2 A] Mean Abs: 1.805 | Max: 6.222
[LOSS Ex2] A: 0.17335 | B: 0.39492 | C: 0.29689
** [JOINT LOSS] ** : 0.941770
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007887 | Grad Max: 0.195482
  -> Layer: shared_layers.0.bias | Grad Mean: 0.485248 | Grad Max: 1.911145
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.007003
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005712 | Grad Max: 0.005712
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003086 | Grad Max: 0.386377
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057230 | Grad Max: 2.155375
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000548 | Grad Max: 0.016273
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028924 | Grad Max: 0.135086
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000833
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006425 | Grad Max: 0.013326
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000363
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001867 | Grad Max: 0.004274
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002147 | Grad Max: 0.004164
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041356 | Grad Max: 0.041356
[GRADIENT NORM TOTAL] 9.4567

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.630
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004511  0.49954888] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 668/1380 | B: 562/1486 | C: 270/1778
[LOSS Ex1] A: 0.65982 | B: 0.65006 | C: 0.64927
[LOGITS Ex2 A] Mean Abs: 1.787 | Max: 5.549
[LOSS Ex2] A: 0.16262 | B: 0.36887 | C: 0.30319
** [JOINT LOSS] ** : 0.931278
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002754 | Grad Max: 0.072637
  -> Layer: shared_layers.0.bias | Grad Mean: 0.232772 | Grad Max: 0.995903
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005831
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000160 | Grad Max: 0.000160
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001483 | Grad Max: 0.180704
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027693 | Grad Max: 1.006372
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.008853
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013686 | Grad Max: 0.072554
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000407
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002901 | Grad Max: 0.006488
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000201
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000825 | Grad Max: 0.002063
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000873 | Grad Max: 0.002324
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017533 | Grad Max: 0.017533
[GRADIENT NORM TOTAL] 4.7181

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.416
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6397509  0.36024916] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.050
[MASKS] A(Pass/Fail): 653/1395 | B: 516/1340 | C: 191/1185
[LOSS Ex1] A: 0.65679 | B: 0.65367 | C: 0.64647
[LOGITS Ex2 A] Mean Abs: 1.743 | Max: 5.662
[LOSS Ex2] A: 0.17501 | B: 0.35933 | C: 0.26676
** [JOINT LOSS] ** : 0.919342
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005578 | Grad Max: 0.143736
  -> Layer: shared_layers.0.bias | Grad Mean: 0.359004 | Grad Max: 1.523907
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.006163
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002703 | Grad Max: 0.002703
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.272056
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041510 | Grad Max: 1.506926
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000385 | Grad Max: 0.012969
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020292 | Grad Max: 0.094658
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000672
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004520 | Grad Max: 0.010558
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000279
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001286 | Grad Max: 0.003220
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001371 | Grad Max: 0.003098
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026696 | Grad Max: 0.026696
[GRADIENT NORM TOTAL] 6.8379

[EPOCH SUMMARY] Train Loss: 0.9340

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9240 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 89/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.491
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58833754 0.41166246] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.050
[MASKS] A(Pass/Fail): 536/1080 | B: 551/1497 | C: 284/1764
[LOSS Ex1] A: 0.65537 | B: 0.65288 | C: 0.64650
[LOGITS Ex2 A] Mean Abs: 1.780 | Max: 6.255
[LOSS Ex2] A: 0.16210 | B: 0.40528 | C: 0.28074
** [JOINT LOSS] ** : 0.934292
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007708 | Grad Max: 0.180835
  -> Layer: shared_layers.0.bias | Grad Mean: 0.540948 | Grad Max: 2.347214
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.006916
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006157 | Grad Max: 0.006157
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003491 | Grad Max: 0.423662
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065319 | Grad Max: 2.346340
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000614 | Grad Max: 0.019192
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032584 | Grad Max: 0.157733
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000939
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007204 | Grad Max: 0.015549
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000414
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002118 | Grad Max: 0.004799
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002576 | Grad Max: 0.004473
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048624 | Grad Max: 0.048624
[GRADIENT NORM TOTAL] 10.8141

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.632
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068857  0.49311423] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 676/1372 | B: 565/1483 | C: 303/1745
[LOSS Ex1] A: 0.65576 | B: 0.65363 | C: 0.64508
[LOGITS Ex2 A] Mean Abs: 1.777 | Max: 6.566
[LOSS Ex2] A: 0.16049 | B: 0.38425 | C: 0.29052
** [JOINT LOSS] ** : 0.929910
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005261 | Grad Max: 0.162287
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268404 | Grad Max: 1.307724
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002220 | Grad Max: 0.006445
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001417 | Grad Max: 0.001417
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001780 | Grad Max: 0.296751
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032514 | Grad Max: 1.648130
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000289 | Grad Max: 0.008801
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015222 | Grad Max: 0.069508
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000505
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003419 | Grad Max: 0.007643
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000227
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001000 | Grad Max: 0.002387
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001188 | Grad Max: 0.002460
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022507 | Grad Max: 0.022507
[GRADIENT NORM TOTAL] 5.7263

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.583
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102206  0.48977938] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 662/1386 | B: 562/1486 | C: 274/1774
[LOSS Ex1] A: 0.65358 | B: 0.64994 | C: 0.64926
[LOGITS Ex2 A] Mean Abs: 1.809 | Max: 5.401
[LOSS Ex2] A: 0.16821 | B: 0.37383 | C: 0.31233
** [JOINT LOSS] ** : 0.935718
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003718 | Grad Max: 0.108989
  -> Layer: shared_layers.0.bias | Grad Mean: 0.225723 | Grad Max: 1.066452
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002292 | Grad Max: 0.006257
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004404 | Grad Max: 0.004404
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001479 | Grad Max: 0.177295
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027389 | Grad Max: 0.958216
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000228 | Grad Max: 0.007472
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012136 | Grad Max: 0.057174
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002653 | Grad Max: 0.006193
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000180
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000764 | Grad Max: 0.001720
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000860 | Grad Max: 0.001906
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016464 | Grad Max: 0.016464
[GRADIENT NORM TOTAL] 4.6244

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.601
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50229526 0.49770477] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 655/1393 | B: 516/1340 | C: 262/1786
[LOSS Ex1] A: 0.65259 | B: 0.65357 | C: 0.64951
[LOGITS Ex2 A] Mean Abs: 1.798 | Max: 6.552
[LOSS Ex2] A: 0.19418 | B: 0.35712 | C: 0.30117
** [JOINT LOSS] ** : 0.936050
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007357 | Grad Max: 0.239239
  -> Layer: shared_layers.0.bias | Grad Mean: 0.364584 | Grad Max: 1.490523
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.007132
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003510 | Grad Max: 0.003510
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002579 | Grad Max: 0.251047
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047050 | Grad Max: 1.421221
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000438 | Grad Max: 0.013202
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022957 | Grad Max: 0.100593
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000658
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005192 | Grad Max: 0.010654
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000315
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001522 | Grad Max: 0.003714
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001793 | Grad Max: 0.003720
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034316 | Grad Max: 0.034316
[GRADIENT NORM TOTAL] 7.3326

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.500
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50410783 0.49589217] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.049
[MASKS] A(Pass/Fail): 637/1411 | B: 551/1497 | C: 264/1784
[LOSS Ex1] A: 0.65918 | B: 0.65277 | C: 0.65008
[LOGITS Ex2 A] Mean Abs: 1.759 | Max: 5.241
[LOSS Ex2] A: 0.16320 | B: 0.38386 | C: 0.29659
** [JOINT LOSS] ** : 0.935228
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003558 | Grad Max: 0.102268
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126095 | Grad Max: 0.515407
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005923
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000333 | Grad Max: 0.000333
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000892 | Grad Max: 0.105907
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015735 | Grad Max: 0.594537
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.004896
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007192 | Grad Max: 0.029379
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000308
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001673 | Grad Max: 0.004409
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000146
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000479 | Grad Max: 0.001476
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000519 | Grad Max: 0.001927
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010289 | Grad Max: 0.010289
[GRADIENT NORM TOTAL] 2.5176

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.429
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5360387 0.4639613] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 627/1421 | B: 565/1483 | C: 292/1756
[LOSS Ex1] A: 0.65895 | B: 0.65352 | C: 0.64814
[LOGITS Ex2 A] Mean Abs: 1.698 | Max: 6.273
[LOSS Ex2] A: 0.17750 | B: 0.40434 | C: 0.28563
** [JOINT LOSS] ** : 0.942691
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006611 | Grad Max: 0.189470
  -> Layer: shared_layers.0.bias | Grad Mean: 0.455846 | Grad Max: 1.972015
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002037 | Grad Max: 0.006430
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005476 | Grad Max: 0.005476
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002872 | Grad Max: 0.319782
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053902 | Grad Max: 1.748670
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000490 | Grad Max: 0.013598
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026048 | Grad Max: 0.116962
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000672
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005757 | Grad Max: 0.011844
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000347
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001679 | Grad Max: 0.004190
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001963 | Grad Max: 0.003587
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037580 | Grad Max: 0.037580
[GRADIENT NORM TOTAL] 8.8648

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.565
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6728455  0.32715458] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.050
[MASKS] A(Pass/Fail): 692/1356 | B: 563/1485 | C: 286/1762
[LOSS Ex1] A: 0.65443 | B: 0.64982 | C: 0.64606
[LOGITS Ex2 A] Mean Abs: 1.730 | Max: 6.385
[LOSS Ex2] A: 0.16184 | B: 0.38960 | C: 0.29822
** [JOINT LOSS] ** : 0.933320
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006211 | Grad Max: 0.161845
  -> Layer: shared_layers.0.bias | Grad Mean: 0.515156 | Grad Max: 2.238544
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002343 | Grad Max: 0.007341
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008466 | Grad Max: 0.008466
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003221 | Grad Max: 0.378496
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060910 | Grad Max: 2.118878
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000555 | Grad Max: 0.018269
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029736 | Grad Max: 0.151278
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000878
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006528 | Grad Max: 0.014313
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000375
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001904 | Grad Max: 0.004163
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002230 | Grad Max: 0.004024
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042457 | Grad Max: 0.042457
[GRADIENT NORM TOTAL] 10.2571

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.634
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004217  0.49957833] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 668/1380 | B: 517/1339 | C: 276/1772
[LOSS Ex1] A: 0.65964 | B: 0.65345 | C: 0.64794
[LOGITS Ex2 A] Mean Abs: 1.745 | Max: 5.429
[LOSS Ex2] A: 0.16552 | B: 0.36623 | C: 0.27994
** [JOINT LOSS] ** : 0.924241
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006327 | Grad Max: 0.179407
  -> Layer: shared_layers.0.bias | Grad Mean: 0.369146 | Grad Max: 1.552226
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.006021
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000680 | Grad Max: 0.000680
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002446 | Grad Max: 0.289744
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045210 | Grad Max: 1.612117
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000418 | Grad Max: 0.012052
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022203 | Grad Max: 0.104997
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000579
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004975 | Grad Max: 0.010512
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000297
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001441 | Grad Max: 0.003341
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001614 | Grad Max: 0.003457
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031305 | Grad Max: 0.031305
[GRADIENT NORM TOTAL] 7.2950

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.419
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64049816 0.3595018 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.050
[MASKS] A(Pass/Fail): 653/1395 | B: 551/1497 | C: 291/1757
[LOSS Ex1] A: 0.65659 | B: 0.65266 | C: 0.64820
[LOGITS Ex2 A] Mean Abs: 1.786 | Max: 5.423
[LOSS Ex2] A: 0.17428 | B: 0.38755 | C: 0.28885
** [JOINT LOSS] ** : 0.936042
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002637 | Grad Max: 0.083023
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203529 | Grad Max: 0.979730
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.006228
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000059 | Grad Max: 0.000059
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001327 | Grad Max: 0.149643
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024247 | Grad Max: 0.840486
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.009172
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012307 | Grad Max: 0.072153
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000355
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002671 | Grad Max: 0.006426
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000773 | Grad Max: 0.001893
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000830 | Grad Max: 0.002121
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016893 | Grad Max: 0.016893
[GRADIENT NORM TOTAL] 4.1577

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.494
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58872885 0.41127118] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.050
[MASKS] A(Pass/Fail): 536/1080 | B: 565/1483 | C: 308/1740
[LOSS Ex1] A: 0.65517 | B: 0.65341 | C: 0.64655
[LOGITS Ex2 A] Mean Abs: 1.853 | Max: 5.929
[LOSS Ex2] A: 0.16426 | B: 0.39384 | C: 0.30217
** [JOINT LOSS] ** : 0.938467
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002924 | Grad Max: 0.101850
  -> Layer: shared_layers.0.bias | Grad Mean: 0.296639 | Grad Max: 1.358741
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.007134
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006572 | Grad Max: 0.006572
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001865 | Grad Max: 0.226990
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034861 | Grad Max: 1.284668
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000318 | Grad Max: 0.010935
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016990 | Grad Max: 0.083918
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000528
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003686 | Grad Max: 0.008070
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001059 | Grad Max: 0.002551
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001122 | Grad Max: 0.002615
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022607 | Grad Max: 0.022607
[GRADIENT NORM TOTAL] 6.1741

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.635
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50692725 0.49307272] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 676/1372 | B: 569/1479 | C: 274/1774
[LOSS Ex1] A: 0.65556 | B: 0.64970 | C: 0.64940
[LOGITS Ex2 A] Mean Abs: 1.812 | Max: 7.043
[LOSS Ex2] A: 0.16341 | B: 0.36356 | C: 0.29296
** [JOINT LOSS] ** : 0.924861
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002577 | Grad Max: 0.102322
  -> Layer: shared_layers.0.bias | Grad Mean: 0.209497 | Grad Max: 1.063770
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002282 | Grad Max: 0.006938
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008314 | Grad Max: 0.008314
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001334 | Grad Max: 0.162416
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024295 | Grad Max: 0.905910
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.008167
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011216 | Grad Max: 0.061102
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000364
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002368 | Grad Max: 0.005306
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000151
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000695 | Grad Max: 0.001667
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000734 | Grad Max: 0.002261
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015941 | Grad Max: 0.015941
[GRADIENT NORM TOTAL] 4.3274

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.586
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51020205 0.489798  ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 663/1385 | B: 524/1332 | C: 269/1779
[LOSS Ex1] A: 0.65337 | B: 0.65334 | C: 0.64923
[LOGITS Ex2 A] Mean Abs: 1.773 | Max: 5.707
[LOSS Ex2] A: 0.16775 | B: 0.37458 | C: 0.29446
** [JOINT LOSS] ** : 0.930908
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004765 | Grad Max: 0.115176
  -> Layer: shared_layers.0.bias | Grad Mean: 0.370177 | Grad Max: 1.641537
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006922
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006638 | Grad Max: 0.006638
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002386 | Grad Max: 0.276886
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044732 | Grad Max: 1.554258
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.013308
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021798 | Grad Max: 0.107491
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000652
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004785 | Grad Max: 0.009963
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000283
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001381 | Grad Max: 0.003314
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001568 | Grad Max: 0.003252
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029954 | Grad Max: 0.029954
[GRADIENT NORM TOTAL] 7.4284

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.604
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023503 0.4976497] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 655/1393 | B: 555/1493 | C: 270/1778
[LOSS Ex1] A: 0.65238 | B: 0.65254 | C: 0.64815
[LOGITS Ex2 A] Mean Abs: 1.730 | Max: 5.575
[LOSS Ex2] A: 0.17806 | B: 0.40392 | C: 0.30219
** [JOINT LOSS] ** : 0.945746
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004321 | Grad Max: 0.141460
  -> Layer: shared_layers.0.bias | Grad Mean: 0.391682 | Grad Max: 1.888290
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002310 | Grad Max: 0.007275
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007911 | Grad Max: 0.007911
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002441 | Grad Max: 0.396415
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045784 | Grad Max: 2.227519
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000413 | Grad Max: 0.013161
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022146 | Grad Max: 0.109318
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000590
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004763 | Grad Max: 0.010000
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000292
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001384 | Grad Max: 0.003357
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001626 | Grad Max: 0.002874
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031693 | Grad Max: 0.031693
[GRADIENT NORM TOTAL] 8.0784

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.503
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041085 0.4958915] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.540 | Std: 0.049
[MASKS] A(Pass/Fail): 637/1411 | B: 568/1480 | C: 192/1184
[LOSS Ex1] A: 0.65899 | B: 0.65329 | C: 0.64767
[LOGITS Ex2 A] Mean Abs: 1.742 | Max: 6.067
[LOSS Ex2] A: 0.16596 | B: 0.38899 | C: 0.30213
** [JOINT LOSS] ** : 0.939016
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002659 | Grad Max: 0.071114
  -> Layer: shared_layers.0.bias | Grad Mean: 0.194236 | Grad Max: 0.970032
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.005473
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001616 | Grad Max: 0.001616
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001342 | Grad Max: 0.246155
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024733 | Grad Max: 1.377778
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000228 | Grad Max: 0.006988
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012167 | Grad Max: 0.055253
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000370
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002639 | Grad Max: 0.006071
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000180
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000765 | Grad Max: 0.001971
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000955 | Grad Max: 0.002282
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017740 | Grad Max: 0.017740
[GRADIENT NORM TOTAL] 4.2723

[EPOCH SUMMARY] Train Loss: 0.9347

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9180 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 90/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.432
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53612417 0.46387586] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 627/1421 | B: 574/1474 | C: 283/1765
[LOSS Ex1] A: 0.65877 | B: 0.64958 | C: 0.64678
[LOGITS Ex2 A] Mean Abs: 1.777 | Max: 5.850
[LOSS Ex2] A: 0.17378 | B: 0.37204 | C: 0.27759
** [JOINT LOSS] ** : 0.926179
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005912 | Grad Max: 0.151513
  -> Layer: shared_layers.0.bias | Grad Mean: 0.371956 | Grad Max: 1.551225
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.006238
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002166 | Grad Max: 0.002166
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002311 | Grad Max: 0.236688
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042941 | Grad Max: 1.320596
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.012821
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021394 | Grad Max: 0.105086
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000602
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004759 | Grad Max: 0.009457
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001406 | Grad Max: 0.003068
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001631 | Grad Max: 0.003781
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032212 | Grad Max: 0.032212
[GRADIENT NORM TOTAL] 7.0313

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.568
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6739083  0.32609177] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 693/1355 | B: 524/1332 | C: 277/1771
[LOSS Ex1] A: 0.65423 | B: 0.65322 | C: 0.64856
[LOGITS Ex2 A] Mean Abs: 1.816 | Max: 5.455
[LOSS Ex2] A: 0.17030 | B: 0.37039 | C: 0.26911
** [JOINT LOSS] ** : 0.921939
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008618 | Grad Max: 0.259520
  -> Layer: shared_layers.0.bias | Grad Mean: 0.467522 | Grad Max: 2.011271
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002220 | Grad Max: 0.007016
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007053 | Grad Max: 0.007053
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003117 | Grad Max: 0.318813
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057393 | Grad Max: 1.718957
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000530 | Grad Max: 0.015307
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028025 | Grad Max: 0.127041
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000787
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006325 | Grad Max: 0.013274
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000362
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001855 | Grad Max: 0.004194
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002126 | Grad Max: 0.004490
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041464 | Grad Max: 0.041464
[GRADIENT NORM TOTAL] 9.1746

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.637
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004218  0.49957818] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 668/1380 | B: 556/1492 | C: 281/1767
[LOSS Ex1] A: 0.65946 | B: 0.65242 | C: 0.64670
[LOGITS Ex2 A] Mean Abs: 1.802 | Max: 5.809
[LOSS Ex2] A: 0.15919 | B: 0.38009 | C: 0.29885
** [JOINT LOSS] ** : 0.932238
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003148 | Grad Max: 0.096211
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264672 | Grad Max: 1.238688
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006571
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008529 | Grad Max: 0.008529
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001671 | Grad Max: 0.182948
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031453 | Grad Max: 1.012811
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.010566
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015594 | Grad Max: 0.084175
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000449
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003382 | Grad Max: 0.007113
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000206
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000977 | Grad Max: 0.002266
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001057 | Grad Max: 0.002720
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021313 | Grad Max: 0.021313
[GRADIENT NORM TOTAL] 5.3818

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.421
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6413189  0.35868105] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 654/1394 | B: 568/1480 | C: 290/1758
[LOSS Ex1] A: 0.65640 | B: 0.65317 | C: 0.64835
[LOGITS Ex2 A] Mean Abs: 1.751 | Max: 6.676
[LOSS Ex2] A: 0.16747 | B: 0.38758 | C: 0.31519
** [JOINT LOSS] ** : 0.942723
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004757 | Grad Max: 0.111824
  -> Layer: shared_layers.0.bias | Grad Mean: 0.286383 | Grad Max: 1.323589
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.006326
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000307 | Grad Max: 0.000307
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001871 | Grad Max: 0.225813
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034297 | Grad Max: 1.282915
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.010624
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017437 | Grad Max: 0.086837
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000519
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003932 | Grad Max: 0.008253
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000252
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001175 | Grad Max: 0.002902
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001446 | Grad Max: 0.002853
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027807 | Grad Max: 0.027807
[GRADIENT NORM TOTAL] 5.4802

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.496
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.58924496 0.41075504] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 538/1078 | B: 575/1473 | C: 286/1762
[LOSS Ex1] A: 0.65497 | B: 0.64946 | C: 0.64767
[LOGITS Ex2 A] Mean Abs: 1.791 | Max: 5.637
[LOSS Ex2] A: 0.16764 | B: 0.38130 | C: 0.27678
** [JOINT LOSS] ** : 0.925942
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006263 | Grad Max: 0.158317
  -> Layer: shared_layers.0.bias | Grad Mean: 0.411033 | Grad Max: 1.627846
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.006615
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005839 | Grad Max: 0.005839
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002628 | Grad Max: 0.310749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048895 | Grad Max: 1.737463
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.014014
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024374 | Grad Max: 0.114914
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000709
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005432 | Grad Max: 0.011512
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001575 | Grad Max: 0.003803
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001859 | Grad Max: 0.003437
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035272 | Grad Max: 0.035272
[GRADIENT NORM TOTAL] 7.8220

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.638
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069125  0.49308747] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 677/1371 | B: 524/1332 | C: 312/1736
[LOSS Ex1] A: 0.65537 | B: 0.65310 | C: 0.64566
[LOGITS Ex2 A] Mean Abs: 1.786 | Max: 6.650
[LOSS Ex2] A: 0.16188 | B: 0.36475 | C: 0.27266
** [JOINT LOSS] ** : 0.917808
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.068718
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140995 | Grad Max: 0.585823
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006144
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001208 | Grad Max: 0.001208
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000847 | Grad Max: 0.154997
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015458 | Grad Max: 0.853149
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.005216
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007542 | Grad Max: 0.035297
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000281
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001647 | Grad Max: 0.004478
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000132
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000453 | Grad Max: 0.001353
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000574 | Grad Max: 0.001818
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009366 | Grad Max: 0.009366
[GRADIENT NORM TOTAL] 2.7910

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.590
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51024634 0.48975363] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.051
[MASKS] A(Pass/Fail): 664/1384 | B: 556/1492 | C: 264/1784
[LOSS Ex1] A: 0.65316 | B: 0.65230 | C: 0.65041
[LOGITS Ex2 A] Mean Abs: 1.828 | Max: 5.745
[LOSS Ex2] A: 0.16897 | B: 0.38907 | C: 0.31134
** [JOINT LOSS] ** : 0.941751
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006583 | Grad Max: 0.154210
  -> Layer: shared_layers.0.bias | Grad Mean: 0.418553 | Grad Max: 1.799149
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002259 | Grad Max: 0.007284
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008040 | Grad Max: 0.008040
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002750 | Grad Max: 0.274588
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051154 | Grad Max: 1.453287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000465 | Grad Max: 0.014269
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024813 | Grad Max: 0.116356
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000774
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005502 | Grad Max: 0.011484
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000360
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001608 | Grad Max: 0.003996
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001858 | Grad Max: 0.003888
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036099 | Grad Max: 0.036099
[GRADIENT NORM TOTAL] 8.2558

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.608
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023442 0.4976558] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 656/1392 | B: 568/1480 | C: 287/1761
[LOSS Ex1] A: 0.65217 | B: 0.65306 | C: 0.64652
[LOGITS Ex2 A] Mean Abs: 1.823 | Max: 6.511
[LOSS Ex2] A: 0.19002 | B: 0.39549 | C: 0.30622
** [JOINT LOSS] ** : 0.947825
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009009 | Grad Max: 0.240570
  -> Layer: shared_layers.0.bias | Grad Mean: 0.576189 | Grad Max: 2.478792
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.007141
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002237 | Grad Max: 0.002237
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003785 | Grad Max: 0.392867
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070356 | Grad Max: 2.148390
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000637 | Grad Max: 0.019486
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033876 | Grad Max: 0.161356
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000936
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007528 | Grad Max: 0.015541
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000450
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002173 | Grad Max: 0.005475
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002432 | Grad Max: 0.004430
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046523 | Grad Max: 0.046523
[GRADIENT NORM TOTAL] 11.4041

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.506
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50406975 0.49593022] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.050
[MASKS] A(Pass/Fail): 637/1411 | B: 575/1473 | C: 294/1754
[LOSS Ex1] A: 0.65881 | B: 0.64934 | C: 0.64572
[LOGITS Ex2 A] Mean Abs: 1.777 | Max: 5.626
[LOSS Ex2] A: 0.16761 | B: 0.36920 | C: 0.28800
** [JOINT LOSS] ** : 0.926227
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003383 | Grad Max: 0.095321
  -> Layer: shared_layers.0.bias | Grad Mean: 0.299511 | Grad Max: 1.153817
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006452
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007100 | Grad Max: 0.007100
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001854 | Grad Max: 0.227946
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034940 | Grad Max: 1.275959
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.012257
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017619 | Grad Max: 0.092840
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000544
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003823 | Grad Max: 0.008191
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000224
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001116 | Grad Max: 0.002566
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001185 | Grad Max: 0.002985
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024440 | Grad Max: 0.024440
[GRADIENT NORM TOTAL] 5.9179

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.435
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53632414 0.4636759 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.048
[MASKS] A(Pass/Fail): 627/1421 | B: 524/1332 | C: 254/1794
[LOSS Ex1] A: 0.65859 | B: 0.65299 | C: 0.65039
[LOGITS Ex2 A] Mean Abs: 1.721 | Max: 5.986
[LOSS Ex2] A: 0.17158 | B: 0.36675 | C: 0.32686
** [JOINT LOSS] ** : 0.942389
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004698 | Grad Max: 0.104231
  -> Layer: shared_layers.0.bias | Grad Mean: 0.308859 | Grad Max: 1.344249
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.006191
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005708 | Grad Max: 0.005708
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.253213
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037635 | Grad Max: 1.407548
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000338 | Grad Max: 0.010457
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018071 | Grad Max: 0.082488
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000531
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003997 | Grad Max: 0.008374
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001171 | Grad Max: 0.002757
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001439 | Grad Max: 0.003014
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027066 | Grad Max: 0.027066
[GRADIENT NORM TOTAL] 6.1985

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.572
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6750052  0.32499477] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.051
[MASKS] A(Pass/Fail): 693/1355 | B: 557/1491 | C: 294/1754
[LOSS Ex1] A: 0.65402 | B: 0.65219 | C: 0.64674
[LOGITS Ex2 A] Mean Abs: 1.759 | Max: 6.763
[LOSS Ex2] A: 0.16666 | B: 0.41137 | C: 0.28548
** [JOINT LOSS] ** : 0.938822
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005559 | Grad Max: 0.141036
  -> Layer: shared_layers.0.bias | Grad Mean: 0.415461 | Grad Max: 1.864884
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.006946
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007340 | Grad Max: 0.007340
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002588 | Grad Max: 0.366275
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048358 | Grad Max: 2.058126
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000424 | Grad Max: 0.013804
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022738 | Grad Max: 0.109250
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000669
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005014 | Grad Max: 0.010691
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000295
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001461 | Grad Max: 0.003393
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001664 | Grad Max: 0.003186
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032336 | Grad Max: 0.032336
[GRADIENT NORM TOTAL] 8.5341

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.641
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50043046 0.49956954] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 670/1378 | B: 569/1479 | C: 309/1739
[LOSS Ex1] A: 0.65928 | B: 0.65294 | C: 0.64347
[LOGITS Ex2 A] Mean Abs: 1.784 | Max: 6.119
[LOSS Ex2] A: 0.15558 | B: 0.39447 | C: 0.27713
** [JOINT LOSS] ** : 0.927628
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004024 | Grad Max: 0.125306
  -> Layer: shared_layers.0.bias | Grad Mean: 0.230756 | Grad Max: 1.155406
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005999
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001877 | Grad Max: 0.001877
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001511 | Grad Max: 0.247612
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027624 | Grad Max: 1.390795
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.007282
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011994 | Grad Max: 0.059262
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000361
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002723 | Grad Max: 0.006421
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000195
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000805 | Grad Max: 0.002132
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000989 | Grad Max: 0.002334
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018445 | Grad Max: 0.018445
[GRADIENT NORM TOTAL] 4.9208

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.424
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64209425 0.35790578] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.051
[MASKS] A(Pass/Fail): 654/1394 | B: 575/1473 | C: 263/1785
[LOSS Ex1] A: 0.65620 | B: 0.64922 | C: 0.64938
[LOGITS Ex2 A] Mean Abs: 1.804 | Max: 6.039
[LOSS Ex2] A: 0.17263 | B: 0.36785 | C: 0.29416
** [JOINT LOSS] ** : 0.929816
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002927 | Grad Max: 0.095361
  -> Layer: shared_layers.0.bias | Grad Mean: 0.223487 | Grad Max: 1.177606
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006235
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000567 | Grad Max: 0.000567
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001488 | Grad Max: 0.173193
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027096 | Grad Max: 0.954938
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.008203
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013496 | Grad Max: 0.061773
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000451
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002987 | Grad Max: 0.007076
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000207
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000880 | Grad Max: 0.002188
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001054 | Grad Max: 0.002516
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020654 | Grad Max: 0.020654
[GRADIENT NORM TOTAL] 4.6808

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.499
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5896842  0.41031575] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 539/1077 | B: 524/1332 | C: 164/1212
[LOSS Ex1] A: 0.65477 | B: 0.65288 | C: 0.64996
[LOGITS Ex2 A] Mean Abs: 1.854 | Max: 6.277
[LOSS Ex2] A: 0.15483 | B: 0.36193 | C: 0.29849
** [JOINT LOSS] ** : 0.924284
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003692 | Grad Max: 0.100178
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282581 | Grad Max: 1.260493
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.006505
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006290 | Grad Max: 0.006290
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.178317
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035321 | Grad Max: 0.989205
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000317 | Grad Max: 0.011064
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017034 | Grad Max: 0.089723
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000552
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003744 | Grad Max: 0.008659
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001080 | Grad Max: 0.002525
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001160 | Grad Max: 0.002980
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023443 | Grad Max: 0.023443
[GRADIENT NORM TOTAL] 5.7097

[EPOCH SUMMARY] Train Loss: 0.9318

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9104 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9143 -> New: 0.9104)

############################## EPOCH 91/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.642
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50693864 0.49306136] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 677/1371 | B: 557/1491 | C: 271/1777
[LOSS Ex1] A: 0.65517 | B: 0.65207 | C: 0.64890
[LOGITS Ex2 A] Mean Abs: 1.804 | Max: 6.409
[LOSS Ex2] A: 0.15922 | B: 0.38517 | C: 0.29861
** [JOINT LOSS] ** : 0.933049
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001850 | Grad Max: 0.052655
  -> Layer: shared_layers.0.bias | Grad Mean: 0.037736 | Grad Max: 0.142953
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.006195
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000490 | Grad Max: 0.000490
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000413 | Grad Max: 0.133576
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.006879 | Grad Max: 0.732717
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002450
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001488 | Grad Max: 0.012846
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000153
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000265 | Grad Max: 0.001622
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000055
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000075 | Grad Max: 0.000469
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000334 | Grad Max: 0.000973
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000040 | Grad Max: 0.000040
[GRADIENT NORM TOTAL] 1.3367

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.593
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51026    0.48973998] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 664/1384 | B: 569/1479 | C: 275/1773
[LOSS Ex1] A: 0.65294 | B: 0.65282 | C: 0.64895
[LOGITS Ex2 A] Mean Abs: 1.782 | Max: 6.107
[LOSS Ex2] A: 0.16175 | B: 0.39879 | C: 0.29624
** [JOINT LOSS] ** : 0.937161
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003027 | Grad Max: 0.099156
  -> Layer: shared_layers.0.bias | Grad Mean: 0.312068 | Grad Max: 1.365302
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.007160
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006480 | Grad Max: 0.006480
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001952 | Grad Max: 0.257526
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036483 | Grad Max: 1.436378
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.010856
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018080 | Grad Max: 0.079522
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000559
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003968 | Grad Max: 0.008357
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000251
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001160 | Grad Max: 0.002909
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001335 | Grad Max: 0.002562
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025859 | Grad Max: 0.025859
[GRADIENT NORM TOTAL] 6.1694

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.612
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023902  0.49760976] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 656/1392 | B: 575/1473 | C: 252/1796
[LOSS Ex1] A: 0.65195 | B: 0.64908 | C: 0.65014
[LOGITS Ex2 A] Mean Abs: 1.773 | Max: 6.965
[LOSS Ex2] A: 0.17945 | B: 0.37247 | C: 0.27531
** [JOINT LOSS] ** : 0.926132
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003416 | Grad Max: 0.083281
  -> Layer: shared_layers.0.bias | Grad Mean: 0.253905 | Grad Max: 1.034825
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002259 | Grad Max: 0.006862
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000903 | Grad Max: 0.000903
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001508 | Grad Max: 0.209436
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028003 | Grad Max: 1.151640
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.011137
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014826 | Grad Max: 0.092071
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000473
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003227 | Grad Max: 0.007158
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000213
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000930 | Grad Max: 0.002288
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001076 | Grad Max: 0.002455
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020562 | Grad Max: 0.020562
[GRADIENT NORM TOTAL] 4.6939

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.509
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.504078 0.495922] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.050
[MASKS] A(Pass/Fail): 637/1411 | B: 527/1329 | C: 272/1776
[LOSS Ex1] A: 0.65861 | B: 0.65274 | C: 0.64887
[LOGITS Ex2 A] Mean Abs: 1.769 | Max: 5.525
[LOSS Ex2] A: 0.15993 | B: 0.36659 | C: 0.28308
** [JOINT LOSS] ** : 0.923274
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003158 | Grad Max: 0.078901
  -> Layer: shared_layers.0.bias | Grad Mean: 0.189416 | Grad Max: 0.906628
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005657
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001132 | Grad Max: 0.001132
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001206 | Grad Max: 0.170423
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021879 | Grad Max: 0.957758
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.005595
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009231 | Grad Max: 0.043727
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000345
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002070 | Grad Max: 0.004872
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000138
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000608 | Grad Max: 0.001519
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000676 | Grad Max: 0.002159
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013920 | Grad Max: 0.013920
[GRADIENT NORM TOTAL] 4.0012

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.438
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53646034 0.4635397 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.540 | Std: 0.049
[MASKS] A(Pass/Fail): 628/1420 | B: 559/1489 | C: 290/1758
[LOSS Ex1] A: 0.65839 | B: 0.65193 | C: 0.64660
[LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.197
[LOSS Ex2] A: 0.17774 | B: 0.38329 | C: 0.30673
** [JOINT LOSS] ** : 0.941560
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004294 | Grad Max: 0.112279
  -> Layer: shared_layers.0.bias | Grad Mean: 0.266477 | Grad Max: 1.223575
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005916
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004871 | Grad Max: 0.004871
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001695 | Grad Max: 0.223539
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031362 | Grad Max: 1.162058
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.010015
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014797 | Grad Max: 0.069218
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000427
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003203 | Grad Max: 0.006955
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000215
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000923 | Grad Max: 0.002328
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001004 | Grad Max: 0.002308
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019795 | Grad Max: 0.019795
[GRADIENT NORM TOTAL] 5.4068

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.575
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6761805 0.3238195] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.051
[MASKS] A(Pass/Fail): 693/1355 | B: 570/1478 | C: 304/1744
[LOSS Ex1] A: 0.65380 | B: 0.65268 | C: 0.64450
[LOGITS Ex2 A] Mean Abs: 1.789 | Max: 5.836
[LOSS Ex2] A: 0.15463 | B: 0.38336 | C: 0.28121
** [JOINT LOSS] ** : 0.923390
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001705 | Grad Max: 0.038782
  -> Layer: shared_layers.0.bias | Grad Mean: 0.054890 | Grad Max: 0.222208
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002267 | Grad Max: 0.007106
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006481 | Grad Max: 0.006481
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000422 | Grad Max: 0.192756
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007135 | Grad Max: 1.086710
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002700
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001369 | Grad Max: 0.014025
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000136
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000243 | Grad Max: 0.001715
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000059
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000079 | Grad Max: 0.000490
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000482 | Grad Max: 0.001126
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001452 | Grad Max: 0.001452
[GRADIENT NORM TOTAL] 1.8972

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.645
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004171 0.4995829] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.050
[MASKS] A(Pass/Fail): 671/1377 | B: 575/1473 | C: 286/1762
[LOSS Ex1] A: 0.65907 | B: 0.64892 | C: 0.64817
[LOGITS Ex2 A] Mean Abs: 1.777 | Max: 5.779
[LOSS Ex2] A: 0.15438 | B: 0.36510 | C: 0.29718
** [JOINT LOSS] ** : 0.924274
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005088 | Grad Max: 0.148960
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208430 | Grad Max: 0.969832
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.006052
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004571 | Grad Max: 0.004571
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001434 | Grad Max: 0.235267
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026066 | Grad Max: 1.312161
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.007037
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012094 | Grad Max: 0.056557
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000465
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002757 | Grad Max: 0.006749
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000190
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.001972
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001010 | Grad Max: 0.002513
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018225 | Grad Max: 0.018225
[GRADIENT NORM TOTAL] 4.3957

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.427
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6430642  0.35693574] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.051
[MASKS] A(Pass/Fail): 654/1394 | B: 528/1328 | C: 310/1738
[LOSS Ex1] A: 0.65596 | B: 0.65259 | C: 0.64428
[LOGITS Ex2 A] Mean Abs: 1.794 | Max: 6.331
[LOSS Ex2] A: 0.17281 | B: 0.35883 | C: 0.27963
** [JOINT LOSS] ** : 0.921369
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004037 | Grad Max: 0.130079
  -> Layer: shared_layers.0.bias | Grad Mean: 0.076718 | Grad Max: 0.376148
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.006346
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000514 | Grad Max: 0.000514
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000649 | Grad Max: 0.081457
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010855 | Grad Max: 0.423184
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003305
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003916 | Grad Max: 0.023704
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000249
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000925 | Grad Max: 0.003039
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000257 | Grad Max: 0.000851
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000460 | Grad Max: 0.001651
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005073 | Grad Max: 0.005073
[GRADIENT NORM TOTAL] 1.7215

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.503
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.590244   0.40975603] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.051
[MASKS] A(Pass/Fail): 541/1075 | B: 561/1487 | C: 319/1729
[LOSS Ex1] A: 0.65450 | B: 0.65176 | C: 0.64188
[LOGITS Ex2 A] Mean Abs: 1.878 | Max: 6.110
[LOSS Ex2] A: 0.15217 | B: 0.38676 | C: 0.28578
** [JOINT LOSS] ** : 0.924281
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004083 | Grad Max: 0.121803
  -> Layer: shared_layers.0.bias | Grad Mean: 0.325477 | Grad Max: 1.489616
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006966
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006330 | Grad Max: 0.006330
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.199119
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038172 | Grad Max: 1.100857
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.009705
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018438 | Grad Max: 0.078121
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000499
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004049 | Grad Max: 0.008848
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000226
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001168 | Grad Max: 0.002572
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001226 | Grad Max: 0.002748
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025054 | Grad Max: 0.025054
[GRADIENT NORM TOTAL] 6.2687

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.646
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069767  0.49302328] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 677/1371 | B: 570/1478 | C: 285/1763
[LOSS Ex1] A: 0.65490 | B: 0.65250 | C: 0.64782
[LOGITS Ex2 A] Mean Abs: 1.850 | Max: 6.611
[LOSS Ex2] A: 0.16891 | B: 0.39186 | C: 0.28289
** [JOINT LOSS] ** : 0.932960
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006082 | Grad Max: 0.186873
  -> Layer: shared_layers.0.bias | Grad Mean: 0.382505 | Grad Max: 1.673456
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.006709
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006682 | Grad Max: 0.006682
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002528 | Grad Max: 0.270969
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046557 | Grad Max: 1.453599
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000428 | Grad Max: 0.012188
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022943 | Grad Max: 0.100090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000630
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005095 | Grad Max: 0.010955
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000310
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001472 | Grad Max: 0.003715
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001636 | Grad Max: 0.003456
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031935 | Grad Max: 0.031935
[GRADIENT NORM TOTAL] 7.5072

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.598
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102018 0.4897982] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 665/1383 | B: 575/1473 | C: 278/1770
[LOSS Ex1] A: 0.65265 | B: 0.64874 | C: 0.64832
[LOGITS Ex2 A] Mean Abs: 1.822 | Max: 5.656
[LOSS Ex2] A: 0.16123 | B: 0.36714 | C: 0.29864
** [JOINT LOSS] ** : 0.925572
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004957 | Grad Max: 0.180381
  -> Layer: shared_layers.0.bias | Grad Mean: 0.077351 | Grad Max: 0.366577
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006851
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004600 | Grad Max: 0.004600
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000708 | Grad Max: 0.097931
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010861 | Grad Max: 0.542438
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003288
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002936 | Grad Max: 0.021758
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000212
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000779 | Grad Max: 0.002827
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000236 | Grad Max: 0.000754
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000406 | Grad Max: 0.001332
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005600 | Grad Max: 0.005600
[GRADIENT NORM TOTAL] 1.8627

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.617
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50247395 0.49752602] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 657/1391 | B: 530/1326 | C: 290/1758
[LOSS Ex1] A: 0.65164 | B: 0.65240 | C: 0.64786
[LOGITS Ex2 A] Mean Abs: 1.763 | Max: 6.968
[LOSS Ex2] A: 0.18579 | B: 0.38944 | C: 0.29523
** [JOINT LOSS] ** : 0.940790
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005287 | Grad Max: 0.158571
  -> Layer: shared_layers.0.bias | Grad Mean: 0.492332 | Grad Max: 2.257746
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.007609
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007978 | Grad Max: 0.007978
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003027 | Grad Max: 0.337989
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056935 | Grad Max: 1.897598
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000519 | Grad Max: 0.017684
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028103 | Grad Max: 0.148489
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000730
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006093 | Grad Max: 0.012983
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000369
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001740 | Grad Max: 0.004411
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001915 | Grad Max: 0.003449
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037953 | Grad Max: 0.037953
[GRADIENT NORM TOTAL] 9.6642

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.513
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.504084 0.495916] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.050
[MASKS] A(Pass/Fail): 637/1411 | B: 562/1486 | C: 304/1744
[LOSS Ex1] A: 0.65834 | B: 0.65157 | C: 0.64317
[LOGITS Ex2 A] Mean Abs: 1.755 | Max: 6.152
[LOSS Ex2] A: 0.16453 | B: 0.41998 | C: 0.30292
** [JOINT LOSS] ** : 0.946832
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009461 | Grad Max: 0.215220
  -> Layer: shared_layers.0.bias | Grad Mean: 0.620826 | Grad Max: 2.662034
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.006122
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005347 | Grad Max: 0.005347
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004053 | Grad Max: 0.421919
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.076124 | Grad Max: 2.323021
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000708 | Grad Max: 0.020869
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.038181 | Grad Max: 0.178900
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001029
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008384 | Grad Max: 0.016848
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000477
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002410 | Grad Max: 0.005349
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002842 | Grad Max: 0.005225
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.053375 | Grad Max: 0.053375
[GRADIENT NORM TOTAL] 12.1599

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.443
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5366132  0.46338683] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 629/1419 | B: 570/1478 | C: 176/1200
[LOSS Ex1] A: 0.65811 | B: 0.65231 | C: 0.64940
[LOGITS Ex2 A] Mean Abs: 1.744 | Max: 6.051
[LOSS Ex2] A: 0.16804 | B: 0.39112 | C: 0.27477
** [JOINT LOSS] ** : 0.931255
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005803 | Grad Max: 0.148400
  -> Layer: shared_layers.0.bias | Grad Mean: 0.312755 | Grad Max: 1.324983
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.005904
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000446 | Grad Max: 0.000446
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.241350
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038718 | Grad Max: 1.353927
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.010835
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019171 | Grad Max: 0.087482
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000547
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004273 | Grad Max: 0.009307
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000266
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001233 | Grad Max: 0.003224
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001470 | Grad Max: 0.003140
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027220 | Grad Max: 0.027220
[GRADIENT NORM TOTAL] 6.0220

[EPOCH SUMMARY] Train Loss: 0.9308

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9156 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 92/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.579
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.67777485 0.32222512] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.052
[MASKS] A(Pass/Fail): 693/1355 | B: 574/1474 | C: 286/1762
[LOSS Ex1] A: 0.65349 | B: 0.64854 | C: 0.64865
[LOGITS Ex2 A] Mean Abs: 1.859 | Max: 6.006
[LOSS Ex2] A: 0.15514 | B: 0.36985 | C: 0.28355
** [JOINT LOSS] ** : 0.919742
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003667 | Grad Max: 0.128812
  -> Layer: shared_layers.0.bias | Grad Mean: 0.339979 | Grad Max: 1.608904
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002284 | Grad Max: 0.006945
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009336 | Grad Max: 0.009336
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.239315
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040262 | Grad Max: 1.332754
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000374 | Grad Max: 0.014221
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020307 | Grad Max: 0.117879
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000615
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004426 | Grad Max: 0.009548
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000254
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001271 | Grad Max: 0.002977
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001456 | Grad Max: 0.003539
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028453 | Grad Max: 0.028453
[GRADIENT NORM TOTAL] 6.9628

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.649
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003976  0.49960235] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 673/1375 | B: 530/1326 | C: 272/1776
[LOSS Ex1] A: 0.65880 | B: 0.65222 | C: 0.64765
[LOGITS Ex2 A] Mean Abs: 1.893 | Max: 5.572
[LOSS Ex2] A: 0.15918 | B: 0.37306 | C: 0.31210
** [JOINT LOSS] ** : 0.934336
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005082 | Grad Max: 0.191642
  -> Layer: shared_layers.0.bias | Grad Mean: 0.561436 | Grad Max: 2.533338
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.006198
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007091 | Grad Max: 0.007091
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003437 | Grad Max: 0.385297
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064882 | Grad Max: 2.161893
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000606 | Grad Max: 0.022189
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032980 | Grad Max: 0.187524
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000888
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007061 | Grad Max: 0.014494
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000385
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002030 | Grad Max: 0.004572
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002262 | Grad Max: 0.004205
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044619 | Grad Max: 0.044619
[GRADIENT NORM TOTAL] 11.2940

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.432
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64423674 0.35576323] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.051
[MASKS] A(Pass/Fail): 654/1394 | B: 561/1487 | C: 290/1758
[LOSS Ex1] A: 0.65567 | B: 0.65140 | C: 0.64852
[LOGITS Ex2 A] Mean Abs: 1.876 | Max: 6.147
[LOSS Ex2] A: 0.17275 | B: 0.39255 | C: 0.29798
** [JOINT LOSS] ** : 0.939618
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004016 | Grad Max: 0.128612
  -> Layer: shared_layers.0.bias | Grad Mean: 0.381852 | Grad Max: 1.698839
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.006175
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005037 | Grad Max: 0.005037
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002358 | Grad Max: 0.281142
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043765 | Grad Max: 1.550687
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.015715
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022288 | Grad Max: 0.129662
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000553
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004754 | Grad Max: 0.009547
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001371 | Grad Max: 0.003123
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001548 | Grad Max: 0.003109
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030472 | Grad Max: 0.030472
[GRADIENT NORM TOTAL] 7.7432

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.507
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59096104 0.409039  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.052
[MASKS] A(Pass/Fail): 542/1074 | B: 568/1480 | C: 289/1759
[LOSS Ex1] A: 0.65422 | B: 0.65215 | C: 0.64743
[LOGITS Ex2 A] Mean Abs: 1.884 | Max: 5.802
[LOSS Ex2] A: 0.16582 | B: 0.38353 | C: 0.30158
** [JOINT LOSS] ** : 0.934910
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004277 | Grad Max: 0.153335
  -> Layer: shared_layers.0.bias | Grad Mean: 0.069341 | Grad Max: 0.386718
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.006357
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004784 | Grad Max: 0.004784
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000710 | Grad Max: 0.163362
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011447 | Grad Max: 0.919861
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.003606
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003537 | Grad Max: 0.026139
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000184
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000886 | Grad Max: 0.002659
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000284 | Grad Max: 0.000851
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.001459
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008038 | Grad Max: 0.008038
[GRADIENT NORM TOTAL] 2.0138

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.651
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069954  0.49300456] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.052
[MASKS] A(Pass/Fail): 678/1370 | B: 574/1474 | C: 285/1763
[LOSS Ex1] A: 0.65462 | B: 0.64838 | C: 0.64502
[LOGITS Ex2 A] Mean Abs: 1.845 | Max: 5.966
[LOSS Ex2] A: 0.15724 | B: 0.37078 | C: 0.29139
** [JOINT LOSS] ** : 0.922477
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005197 | Grad Max: 0.154039
  -> Layer: shared_layers.0.bias | Grad Mean: 0.281595 | Grad Max: 1.280005
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.006672
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002254 | Grad Max: 0.002254
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.270828
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034429 | Grad Max: 1.520162
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000317 | Grad Max: 0.009581
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017041 | Grad Max: 0.085095
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000501
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003780 | Grad Max: 0.008595
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000243
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001088 | Grad Max: 0.002437
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001298 | Grad Max: 0.002644
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024479 | Grad Max: 0.024479
[GRADIENT NORM TOTAL] 5.6071

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.603
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102565  0.48974347] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 665/1383 | B: 530/1326 | C: 288/1760
[LOSS Ex1] A: 0.65236 | B: 0.65207 | C: 0.64421
[LOGITS Ex2 A] Mean Abs: 1.846 | Max: 6.658
[LOSS Ex2] A: 0.15899 | B: 0.35861 | C: 0.27957
** [JOINT LOSS] ** : 0.915270
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002564 | Grad Max: 0.072754
  -> Layer: shared_layers.0.bias | Grad Mean: 0.124360 | Grad Max: 0.646574
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.007165
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007611 | Grad Max: 0.007611
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000850 | Grad Max: 0.278419
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014817 | Grad Max: 1.565183
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.004689
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004523 | Grad Max: 0.033185
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000872 | Grad Max: 0.003131
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000239 | Grad Max: 0.000838
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001223
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004972 | Grad Max: 0.004972
[GRADIENT NORM TOTAL] 3.0980

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.622
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024591  0.49754086] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 658/1390 | B: 561/1487 | C: 299/1749
[LOSS Ex1] A: 0.65136 | B: 0.65124 | C: 0.64648
[LOGITS Ex2 A] Mean Abs: 1.865 | Max: 7.097
[LOSS Ex2] A: 0.18188 | B: 0.38529 | C: 0.28976
** [JOINT LOSS] ** : 0.935338
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007570 | Grad Max: 0.294624
  -> Layer: shared_layers.0.bias | Grad Mean: 0.268254 | Grad Max: 1.196448
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.006576
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002533 | Grad Max: 0.002533
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.188115
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035470 | Grad Max: 0.967117
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.009353
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017001 | Grad Max: 0.073239
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000538
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003868 | Grad Max: 0.008324
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000279
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001105 | Grad Max: 0.002830
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001226 | Grad Max: 0.002574
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022825 | Grad Max: 0.022825
[GRADIENT NORM TOTAL] 5.1843

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.517
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040335 0.4959665] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.541 | Std: 0.051
[MASKS] A(Pass/Fail): 639/1409 | B: 568/1480 | C: 275/1773
[LOSS Ex1] A: 0.65809 | B: 0.65199 | C: 0.64710
[LOGITS Ex2 A] Mean Abs: 1.830 | Max: 6.043
[LOSS Ex2] A: 0.16440 | B: 0.38289 | C: 0.27157
** [JOINT LOSS] ** : 0.925346
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004814 | Grad Max: 0.142152
  -> Layer: shared_layers.0.bias | Grad Mean: 0.165591 | Grad Max: 0.753873
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.005676
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005020 | Grad Max: 0.005020
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001202 | Grad Max: 0.180513
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021584 | Grad Max: 0.934202
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.004630
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009056 | Grad Max: 0.041225
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000325
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002073 | Grad Max: 0.004968
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000598 | Grad Max: 0.001504
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000691 | Grad Max: 0.001983
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012819 | Grad Max: 0.012819
[GRADIENT NORM TOTAL] 3.5458

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.447
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5368057  0.46319425] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.049
[MASKS] A(Pass/Fail): 629/1419 | B: 574/1474 | C: 316/1732
[LOSS Ex1] A: 0.65787 | B: 0.64821 | C: 0.64249
[LOGITS Ex2 A] Mean Abs: 1.767 | Max: 6.467
[LOSS Ex2] A: 0.16605 | B: 0.36560 | C: 0.27598
** [JOINT LOSS] ** : 0.918730
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002956 | Grad Max: 0.066338
  -> Layer: shared_layers.0.bias | Grad Mean: 0.211834 | Grad Max: 0.774864
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006985
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008664 | Grad Max: 0.008664
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001353 | Grad Max: 0.228826
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025063 | Grad Max: 1.297519
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.007038
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012679 | Grad Max: 0.064054
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000397
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002790 | Grad Max: 0.006172
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000179
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.002052
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000912 | Grad Max: 0.002290
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017517 | Grad Max: 0.017517
[GRADIENT NORM TOTAL] 4.2809

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.584
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.679265   0.32073498] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.052
[MASKS] A(Pass/Fail): 694/1354 | B: 530/1326 | C: 293/1755
[LOSS Ex1] A: 0.65322 | B: 0.65191 | C: 0.64935
[LOGITS Ex2 A] Mean Abs: 1.821 | Max: 6.047
[LOSS Ex2] A: 0.15875 | B: 0.37029 | C: 0.31864
** [JOINT LOSS] ** : 0.934051
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002368 | Grad Max: 0.078193
  -> Layer: shared_layers.0.bias | Grad Mean: 0.253054 | Grad Max: 1.076362
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.006888
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007903 | Grad Max: 0.007903
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001464 | Grad Max: 0.230275
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027170 | Grad Max: 1.291071
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.009161
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013900 | Grad Max: 0.067334
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000418
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003021 | Grad Max: 0.006504
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000242
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000869 | Grad Max: 0.002275
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000981 | Grad Max: 0.002050
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019146 | Grad Max: 0.019146
[GRADIENT NORM TOTAL] 4.8876

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.655
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040823 0.49959177] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.051
[MASKS] A(Pass/Fail): 677/1371 | B: 564/1484 | C: 272/1776
[LOSS Ex1] A: 0.65855 | B: 0.65108 | C: 0.64695
[LOGITS Ex2 A] Mean Abs: 1.851 | Max: 5.614
[LOSS Ex2] A: 0.15595 | B: 0.38089 | C: 0.27419
** [JOINT LOSS] ** : 0.922538
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002971 | Grad Max: 0.099688
  -> Layer: shared_layers.0.bias | Grad Mean: 0.119334 | Grad Max: 0.558446
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.006445
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006482 | Grad Max: 0.006482
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000883 | Grad Max: 0.098125
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015048 | Grad Max: 0.537029
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.006388
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005963 | Grad Max: 0.041264
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000276
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001152 | Grad Max: 0.003678
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000327 | Grad Max: 0.000922
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000464 | Grad Max: 0.001588
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006809 | Grad Max: 0.006809
[GRADIENT NORM TOTAL] 2.5326

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.436
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64538574 0.3546142 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 658/1390 | B: 568/1480 | C: 290/1758
[LOSS Ex1] A: 0.65539 | B: 0.65183 | C: 0.64849
[LOGITS Ex2 A] Mean Abs: 1.838 | Max: 5.875
[LOSS Ex2] A: 0.17367 | B: 0.37886 | C: 0.30794
** [JOINT LOSS] ** : 0.938729
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.068216
  -> Layer: shared_layers.0.bias | Grad Mean: 0.157219 | Grad Max: 0.888710
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.006228
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001132 | Grad Max: 0.001132
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001063 | Grad Max: 0.160097
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018832 | Grad Max: 0.895306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.005882
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007734 | Grad Max: 0.045255
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000247
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001598 | Grad Max: 0.004417
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000110
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000446 | Grad Max: 0.001160
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001335
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008590 | Grad Max: 0.008590
[GRADIENT NORM TOTAL] 3.4145

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.511
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5916473 0.4083527] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.052
[MASKS] A(Pass/Fail): 544/1072 | B: 574/1474 | C: 292/1756
[LOSS Ex1] A: 0.65393 | B: 0.64803 | C: 0.64540
[LOGITS Ex2 A] Mean Abs: 1.863 | Max: 5.853
[LOSS Ex2] A: 0.15106 | B: 0.36794 | C: 0.28996
** [JOINT LOSS] ** : 0.918773
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004178 | Grad Max: 0.114518
  -> Layer: shared_layers.0.bias | Grad Mean: 0.169060 | Grad Max: 0.893915
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.006544
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005485 | Grad Max: 0.005485
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001213 | Grad Max: 0.192588
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022316 | Grad Max: 1.094017
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.006237
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010760 | Grad Max: 0.048084
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000353
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002395 | Grad Max: 0.005685
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000177
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000685 | Grad Max: 0.001709
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000867 | Grad Max: 0.001996
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015484 | Grad Max: 0.015484
[GRADIENT NORM TOTAL] 3.5422

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.656
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50704396 0.4929561 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.052
[MASKS] A(Pass/Fail): 678/1370 | B: 530/1326 | C: 221/1155
[LOSS Ex1] A: 0.65434 | B: 0.65174 | C: 0.63927
[LOGITS Ex2 A] Mean Abs: 1.838 | Max: 6.722
[LOSS Ex2] A: 0.16645 | B: 0.35847 | C: 0.28398
** [JOINT LOSS] ** : 0.918083
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.069143
  -> Layer: shared_layers.0.bias | Grad Mean: 0.053644 | Grad Max: 0.232167
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.006422
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003359 | Grad Max: 0.003359
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000482 | Grad Max: 0.105066
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007950 | Grad Max: 0.584918
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.002901
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002437 | Grad Max: 0.023816
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000222
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000530 | Grad Max: 0.002255
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000160 | Grad Max: 0.000595
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001369
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003989 | Grad Max: 0.003989
[GRADIENT NORM TOTAL] 1.4640

[EPOCH SUMMARY] Train Loss: 0.9270

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9066 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9104 -> New: 0.9066)

############################## EPOCH 93/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.608
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102843  0.48971567] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.052
[MASKS] A(Pass/Fail): 665/1383 | B: 566/1482 | C: 313/1735
[LOSS Ex1] A: 0.65205 | B: 0.65090 | C: 0.64309
[LOGITS Ex2 A] Mean Abs: 1.843 | Max: 6.004
[LOSS Ex2] A: 0.15882 | B: 0.38725 | C: 0.27441
** [JOINT LOSS] ** : 0.922175
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003158 | Grad Max: 0.103675
  -> Layer: shared_layers.0.bias | Grad Mean: 0.079199 | Grad Max: 0.327341
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006802
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003918 | Grad Max: 0.003918
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000581 | Grad Max: 0.167401
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009151 | Grad Max: 0.900169
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.003589
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001999 | Grad Max: 0.018306
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000163
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000322 | Grad Max: 0.001739
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000061
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000553
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000283 | Grad Max: 0.000940
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001749 | Grad Max: 0.001749
[GRADIENT NORM TOTAL] 2.0753

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.627
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50248706 0.497513  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 661/1387 | B: 568/1480 | C: 298/1750
[LOSS Ex1] A: 0.65104 | B: 0.65164 | C: 0.64534
[LOGITS Ex2 A] Mean Abs: 1.847 | Max: 6.486
[LOSS Ex2] A: 0.17422 | B: 0.38445 | C: 0.28379
** [JOINT LOSS] ** : 0.930159
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003805 | Grad Max: 0.176372
  -> Layer: shared_layers.0.bias | Grad Mean: 0.086100 | Grad Max: 0.330938
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.007153
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004991 | Grad Max: 0.004991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000788 | Grad Max: 0.277500
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012200 | Grad Max: 1.553403
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003341
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003173 | Grad Max: 0.020364
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000210
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000844 | Grad Max: 0.002746
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000257 | Grad Max: 0.000822
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000321 | Grad Max: 0.001186
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005605 | Grad Max: 0.005605
[GRADIENT NORM TOTAL] 2.6806

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.521
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039947  0.49600533] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.051
[MASKS] A(Pass/Fail): 643/1405 | B: 575/1473 | C: 338/1710
[LOSS Ex1] A: 0.65781 | B: 0.64783 | C: 0.64287
[LOGITS Ex2 A] Mean Abs: 1.812 | Max: 7.042
[LOSS Ex2] A: 0.15903 | B: 0.36646 | C: 0.28254
** [JOINT LOSS] ** : 0.918850
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.050671
  -> Layer: shared_layers.0.bias | Grad Mean: 0.068912 | Grad Max: 0.328047
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006131
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003249 | Grad Max: 0.003249
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000542 | Grad Max: 0.206713
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009309 | Grad Max: 1.158949
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003000
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002546 | Grad Max: 0.022085
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000146
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000507 | Grad Max: 0.002246
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000143 | Grad Max: 0.000648
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000391 | Grad Max: 0.001146
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002982 | Grad Max: 0.002982
[GRADIENT NORM TOTAL] 2.1317

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.451
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53708684 0.46291316] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.541 | Std: 0.050
[MASKS] A(Pass/Fail): 634/1414 | B: 530/1326 | C: 309/1739
[LOSS Ex1] A: 0.65758 | B: 0.65153 | C: 0.64526
[LOGITS Ex2 A] Mean Abs: 1.820 | Max: 6.467
[LOSS Ex2] A: 0.17223 | B: 0.36018 | C: 0.29376
** [JOINT LOSS] ** : 0.926847
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003066 | Grad Max: 0.099813
  -> Layer: shared_layers.0.bias | Grad Mean: 0.076162 | Grad Max: 0.292323
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.006115
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003572 | Grad Max: 0.003572
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000628 | Grad Max: 0.102752
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010110 | Grad Max: 0.566351
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.004246
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002133 | Grad Max: 0.021278
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000119
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000307 | Grad Max: 0.001841
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000054
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000086 | Grad Max: 0.000566
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000217 | Grad Max: 0.000945
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001604 | Grad Max: 0.001604
[GRADIENT NORM TOTAL] 1.8266

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.588
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6811671  0.31883287] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.053
[MASKS] A(Pass/Fail): 695/1353 | B: 568/1480 | C: 297/1751
[LOSS Ex1] A: 0.65288 | B: 0.65068 | C: 0.64673
[LOGITS Ex2 A] Mean Abs: 1.861 | Max: 5.963
[LOSS Ex2] A: 0.16112 | B: 0.39143 | C: 0.29886
** [JOINT LOSS] ** : 0.933894
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.040965
  -> Layer: shared_layers.0.bias | Grad Mean: 0.050697 | Grad Max: 0.203648
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.007236
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010883 | Grad Max: 0.010883
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000451 | Grad Max: 0.103377
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007579 | Grad Max: 0.580788
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002616
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001428 | Grad Max: 0.013343
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000186
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000249 | Grad Max: 0.001585
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000061
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000066 | Grad Max: 0.000347
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000161 | Grad Max: 0.000727
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000005 | Grad Max: 0.000005
[GRADIENT NORM TOTAL] 1.3744

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.660
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005068 0.4994932] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 685/1363 | B: 573/1475 | C: 299/1749
[LOSS Ex1] A: 0.65823 | B: 0.65141 | C: 0.64607
[LOGITS Ex2 A] Mean Abs: 1.845 | Max: 5.734
[LOSS Ex2] A: 0.15611 | B: 0.38364 | C: 0.29217
** [JOINT LOSS] ** : 0.929206
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003615 | Grad Max: 0.151979
  -> Layer: shared_layers.0.bias | Grad Mean: 0.072190 | Grad Max: 0.334765
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.005404
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001131 | Grad Max: 0.001131
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000680 | Grad Max: 0.069922
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011271 | Grad Max: 0.385466
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.004539
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004500 | Grad Max: 0.030901
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001108 | Grad Max: 0.003268
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000333 | Grad Max: 0.000972
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000492 | Grad Max: 0.001358
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008522 | Grad Max: 0.008522
[GRADIENT NORM TOTAL] 1.7060

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.441
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6470224  0.35297763] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 668/1380 | B: 576/1472 | C: 279/1769
[LOSS Ex1] A: 0.65504 | B: 0.64757 | C: 0.64852
[LOGITS Ex2 A] Mean Abs: 1.855 | Max: 5.738
[LOSS Ex2] A: 0.17752 | B: 0.36145 | C: 0.27253
** [JOINT LOSS] ** : 0.920875
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004083 | Grad Max: 0.158371
  -> Layer: shared_layers.0.bias | Grad Mean: 0.173027 | Grad Max: 0.697464
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.005970
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004082 | Grad Max: 0.004082
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001232 | Grad Max: 0.153926
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021975 | Grad Max: 0.860011
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000186 | Grad Max: 0.006756
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009860 | Grad Max: 0.054284
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000312
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002210 | Grad Max: 0.004962
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000136
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000626 | Grad Max: 0.001524
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000727 | Grad Max: 0.002163
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014285 | Grad Max: 0.014285
[GRADIENT NORM TOTAL] 3.6321

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.518
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59265697 0.40734306] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.053
[MASKS] A(Pass/Fail): 554/1062 | B: 531/1325 | C: 282/1766
[LOSS Ex1] A: 0.65355 | B: 0.65127 | C: 0.64774
[LOGITS Ex2 A] Mean Abs: 1.911 | Max: 6.442
[LOSS Ex2] A: 0.15466 | B: 0.36337 | C: 0.26729
** [JOINT LOSS] ** : 0.912627
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003322 | Grad Max: 0.109196
  -> Layer: shared_layers.0.bias | Grad Mean: 0.077544 | Grad Max: 0.287559
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.006465
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000565 | Grad Max: 0.000565
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000654 | Grad Max: 0.101660
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011216 | Grad Max: 0.560014
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.002733
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003481 | Grad Max: 0.020482
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000181
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000812 | Grad Max: 0.002764
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000233 | Grad Max: 0.000794
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000422 | Grad Max: 0.001453
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004993 | Grad Max: 0.004993
[GRADIENT NORM TOTAL] 1.7912

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.662
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50695986 0.49304014] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.053
[MASKS] A(Pass/Fail): 681/1367 | B: 568/1480 | C: 301/1747
[LOSS Ex1] A: 0.65394 | B: 0.65040 | C: 0.64618
[LOGITS Ex2 A] Mean Abs: 1.852 | Max: 5.836
[LOSS Ex2] A: 0.16092 | B: 0.40089 | C: 0.31621
** [JOINT LOSS] ** : 0.942846
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002926 | Grad Max: 0.087344
  -> Layer: shared_layers.0.bias | Grad Mean: 0.213648 | Grad Max: 1.023857
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006235
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001747 | Grad Max: 0.001747
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001467 | Grad Max: 0.242605
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026443 | Grad Max: 1.355647
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.008899
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012123 | Grad Max: 0.068545
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000345
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002509 | Grad Max: 0.005537
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000151
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000727 | Grad Max: 0.001805
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000901 | Grad Max: 0.001806
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017392 | Grad Max: 0.017392
[GRADIENT NORM TOTAL] 4.6766

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.615
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104789  0.48952106] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.053
[MASKS] A(Pass/Fail): 668/1380 | B: 573/1475 | C: 319/1729
[LOSS Ex1] A: 0.65160 | B: 0.65113 | C: 0.64264
[LOGITS Ex2 A] Mean Abs: 1.867 | Max: 6.049
[LOSS Ex2] A: 0.15968 | B: 0.38782 | C: 0.29235
** [JOINT LOSS] ** : 0.928407
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002776 | Grad Max: 0.084195
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215889 | Grad Max: 0.993085
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006540
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000347 | Grad Max: 0.000347
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001464 | Grad Max: 0.212727
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027338 | Grad Max: 1.195763
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.009312
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013381 | Grad Max: 0.079547
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000361
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002871 | Grad Max: 0.006406
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000831 | Grad Max: 0.002068
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001006 | Grad Max: 0.001906
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019339 | Grad Max: 0.019339
[GRADIENT NORM TOTAL] 4.6895

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.635
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024326  0.49756747] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.053
[MASKS] A(Pass/Fail): 668/1380 | B: 577/1471 | C: 297/1751
[LOSS Ex1] A: 0.65058 | B: 0.64728 | C: 0.64591
[LOGITS Ex2 A] Mean Abs: 1.874 | Max: 6.684
[LOSS Ex2] A: 0.18199 | B: 0.35464 | C: 0.30848
** [JOINT LOSS] ** : 0.929626
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005041 | Grad Max: 0.156248
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269874 | Grad Max: 0.924642
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002327 | Grad Max: 0.007141
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001807 | Grad Max: 0.001807
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001759 | Grad Max: 0.191732
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031743 | Grad Max: 1.083912
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.009703
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015406 | Grad Max: 0.080803
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000456
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003423 | Grad Max: 0.008102
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000956 | Grad Max: 0.002358
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001003 | Grad Max: 0.002218
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019616 | Grad Max: 0.019616
[GRADIENT NORM TOTAL] 5.1261

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.527
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50386965 0.49613038] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.542 | Std: 0.052
[MASKS] A(Pass/Fail): 647/1401 | B: 532/1324 | C: 309/1739
[LOSS Ex1] A: 0.65743 | B: 0.65100 | C: 0.64718
[LOGITS Ex2 A] Mean Abs: 1.850 | Max: 5.752
[LOSS Ex2] A: 0.15278 | B: 0.36034 | C: 0.26409
** [JOINT LOSS] ** : 0.910937
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002499 | Grad Max: 0.066421
  -> Layer: shared_layers.0.bias | Grad Mean: 0.175316 | Grad Max: 0.692659
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.005339
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002435 | Grad Max: 0.002435
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.202857
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021048 | Grad Max: 1.134658
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.006109
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009008 | Grad Max: 0.046128
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000345
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001941 | Grad Max: 0.004658
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000123
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000561 | Grad Max: 0.001473
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000628 | Grad Max: 0.002384
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012907 | Grad Max: 0.012907
[GRADIENT NORM TOTAL] 3.7130

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.458
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53754354 0.4624564 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.051
[MASKS] A(Pass/Fail): 640/1408 | B: 570/1478 | C: 313/1735
[LOSS Ex1] A: 0.65720 | B: 0.65014 | C: 0.64544
[LOGITS Ex2 A] Mean Abs: 1.802 | Max: 6.527
[LOSS Ex2] A: 0.16202 | B: 0.39034 | C: 0.27333
** [JOINT LOSS] ** : 0.926156
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004074 | Grad Max: 0.111585
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201297 | Grad Max: 0.827602
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.006180
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005935 | Grad Max: 0.005935
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001321 | Grad Max: 0.226707
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024192 | Grad Max: 1.279555
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.006656
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011379 | Grad Max: 0.053713
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000407
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002502 | Grad Max: 0.005803
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000159
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000714 | Grad Max: 0.001814
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000862 | Grad Max: 0.002005
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015952 | Grad Max: 0.015952
[GRADIENT NORM TOTAL] 4.1988

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.594
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6836789 0.3163211] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.053
[MASKS] A(Pass/Fail): 697/1351 | B: 573/1475 | C: 221/1155
[LOSS Ex1] A: 0.65244 | B: 0.65088 | C: 0.64490
[LOGITS Ex2 A] Mean Abs: 1.870 | Max: 6.060
[LOSS Ex2] A: 0.14830 | B: 0.37986 | C: 0.29265
** [JOINT LOSS] ** : 0.923013
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003143 | Grad Max: 0.081301
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094740 | Grad Max: 0.370645
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006419
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001831 | Grad Max: 0.001831
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000713 | Grad Max: 0.127491
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012843 | Grad Max: 0.697899
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000110 | Grad Max: 0.005871
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005859 | Grad Max: 0.042330
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000319
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001299 | Grad Max: 0.004187
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000103
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000374 | Grad Max: 0.001105
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000495 | Grad Max: 0.001610
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008682 | Grad Max: 0.008682
[GRADIENT NORM TOTAL] 2.0956

[EPOCH SUMMARY] Train Loss: 0.9254

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9099 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 94/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.666
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50059867 0.49940136] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.052
[MASKS] A(Pass/Fail): 686/1362 | B: 577/1471 | C: 325/1723
[LOSS Ex1] A: 0.65786 | B: 0.64703 | C: 0.64720
[LOGITS Ex2 A] Mean Abs: 1.897 | Max: 5.824
[LOSS Ex2] A: 0.15172 | B: 0.37418 | C: 0.31330
** [JOINT LOSS] ** : 0.930429
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003410 | Grad Max: 0.167050
  -> Layer: shared_layers.0.bias | Grad Mean: 0.393950 | Grad Max: 2.036755
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.006072
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007418 | Grad Max: 0.007418
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002502 | Grad Max: 0.333904
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046451 | Grad Max: 1.866511
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000422 | Grad Max: 0.016837
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023402 | Grad Max: 0.130004
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000621
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004909 | Grad Max: 0.011024
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000268
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001383 | Grad Max: 0.003242
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001446 | Grad Max: 0.002972
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030060 | Grad Max: 0.030060
[GRADIENT NORM TOTAL] 8.2414

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.447
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64879954 0.35120043] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.053
[MASKS] A(Pass/Fail): 669/1379 | B: 532/1324 | C: 319/1729
[LOSS Ex1] A: 0.65464 | B: 0.65077 | C: 0.64446
[LOGITS Ex2 A] Mean Abs: 1.890 | Max: 6.228
[LOSS Ex2] A: 0.17786 | B: 0.35962 | C: 0.27432
** [JOINT LOSS] ** : 0.920552
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003436 | Grad Max: 0.089997
  -> Layer: shared_layers.0.bias | Grad Mean: 0.259491 | Grad Max: 1.116624
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.006681
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003235 | Grad Max: 0.003235
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001770 | Grad Max: 0.257933
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032716 | Grad Max: 1.437705
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.010635
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015478 | Grad Max: 0.094235
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000416
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003382 | Grad Max: 0.007164
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000176
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000971 | Grad Max: 0.002216
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001094 | Grad Max: 0.002542
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021927 | Grad Max: 0.021927
[GRADIENT NORM TOTAL] 5.6039

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.524
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5937136 0.4062864] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.053
[MASKS] A(Pass/Fail): 558/1058 | B: 570/1478 | C: 353/1695
[LOSS Ex1] A: 0.65315 | B: 0.64990 | C: 0.64035
[LOGITS Ex2 A] Mean Abs: 1.920 | Max: 5.690
[LOSS Ex2] A: 0.15676 | B: 0.38217 | C: 0.27585
** [JOINT LOSS] ** : 0.919392
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.054574
  -> Layer: shared_layers.0.bias | Grad Mean: 0.110690 | Grad Max: 0.508863
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.006937
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005334 | Grad Max: 0.005334
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000764 | Grad Max: 0.177208
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013670 | Grad Max: 0.996782
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000099 | Grad Max: 0.004160
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005135 | Grad Max: 0.030032
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000261
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001095 | Grad Max: 0.003512
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000312 | Grad Max: 0.000905
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000411 | Grad Max: 0.001270
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007238 | Grad Max: 0.007238
[GRADIENT NORM TOTAL] 2.5197

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.668
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.506917 0.493083] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.054
[MASKS] A(Pass/Fail): 683/1365 | B: 573/1475 | C: 314/1734
[LOSS Ex1] A: 0.65355 | B: 0.65065 | C: 0.64590
[LOGITS Ex2 A] Mean Abs: 1.879 | Max: 6.400
[LOSS Ex2] A: 0.15356 | B: 0.38044 | C: 0.28284
** [JOINT LOSS] ** : 0.922319
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.070122
  -> Layer: shared_layers.0.bias | Grad Mean: 0.128331 | Grad Max: 0.617764
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.006336
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004969 | Grad Max: 0.004969
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000793 | Grad Max: 0.337195
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014149 | Grad Max: 1.891307
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.003927
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005312 | Grad Max: 0.027226
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000193
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001102 | Grad Max: 0.003435
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000293 | Grad Max: 0.001132
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000404 | Grad Max: 0.001517
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006167 | Grad Max: 0.006167
[GRADIENT NORM TOTAL] 3.3374

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.622
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105466  0.48945338] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.053
[MASKS] A(Pass/Fail): 669/1379 | B: 578/1470 | C: 323/1725
[LOSS Ex1] A: 0.65120 | B: 0.64679 | C: 0.64488
[LOGITS Ex2 A] Mean Abs: 1.902 | Max: 6.255
[LOSS Ex2] A: 0.15565 | B: 0.36219 | C: 0.28684
** [JOINT LOSS] ** : 0.915851
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003191 | Grad Max: 0.078905
  -> Layer: shared_layers.0.bias | Grad Mean: 0.103836 | Grad Max: 0.524116
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002281 | Grad Max: 0.006705
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003866 | Grad Max: 0.003866
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000796 | Grad Max: 0.128161
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014482 | Grad Max: 0.713685
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.004999
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006524 | Grad Max: 0.034178
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000287
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001456 | Grad Max: 0.003885
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000416 | Grad Max: 0.001111
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000519 | Grad Max: 0.001776
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009631 | Grad Max: 0.009631
[GRADIENT NORM TOTAL] 2.3324

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.642
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50243807 0.49756187] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.053
[MASKS] A(Pass/Fail): 670/1378 | B: 533/1323 | C: 320/1728
[LOSS Ex1] A: 0.65019 | B: 0.65054 | C: 0.64553
[LOGITS Ex2 A] Mean Abs: 1.867 | Max: 7.368
[LOSS Ex2] A: 0.16788 | B: 0.36542 | C: 0.28528
** [JOINT LOSS] ** : 0.921614
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003068 | Grad Max: 0.058330
  -> Layer: shared_layers.0.bias | Grad Mean: 0.172327 | Grad Max: 0.809718
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.007050
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002179 | Grad Max: 0.002179
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001099 | Grad Max: 0.096728
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019363 | Grad Max: 0.531295
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006588
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009865 | Grad Max: 0.050871
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000330
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001998 | Grad Max: 0.005191
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000521 | Grad Max: 0.001587
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000483 | Grad Max: 0.001482
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009483 | Grad Max: 0.009483
[GRADIENT NORM TOTAL] 3.1990

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.533
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037916 0.4962085] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 648/1400 | B: 571/1477 | C: 311/1737
[LOSS Ex1] A: 0.65709 | B: 0.64966 | C: 0.64599
[LOGITS Ex2 A] Mean Abs: 1.850 | Max: 6.228
[LOSS Ex2] A: 0.14682 | B: 0.38881 | C: 0.26720
** [JOINT LOSS] ** : 0.918527
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001889 | Grad Max: 0.052539
  -> Layer: shared_layers.0.bias | Grad Mean: 0.026977 | Grad Max: 0.146009
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.005680
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004096 | Grad Max: 0.004096
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000339 | Grad Max: 0.061791
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.005455 | Grad Max: 0.323886
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002344
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001524 | Grad Max: 0.014093
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000150
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000263 | Grad Max: 0.002147
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000074 | Grad Max: 0.000595
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000444 | Grad Max: 0.001217
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001130 | Grad Max: 0.001130
[GRADIENT NORM TOTAL] 0.9134

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.464
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53784883 0.46215117] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.051
[MASKS] A(Pass/Fail): 641/1407 | B: 573/1475 | C: 336/1712
[LOSS Ex1] A: 0.65687 | B: 0.65041 | C: 0.64451
[LOGITS Ex2 A] Mean Abs: 1.861 | Max: 6.131
[LOSS Ex2] A: 0.16965 | B: 0.38139 | C: 0.28696
** [JOINT LOSS] ** : 0.929929
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002282 | Grad Max: 0.103971
  -> Layer: shared_layers.0.bias | Grad Mean: 0.262384 | Grad Max: 1.264064
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.007002
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010511 | Grad Max: 0.010511
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001661 | Grad Max: 0.259986
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030811 | Grad Max: 1.461378
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000249 | Grad Max: 0.010144
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013801 | Grad Max: 0.085307
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000360
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002946 | Grad Max: 0.006465
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000817 | Grad Max: 0.002019
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000790 | Grad Max: 0.001984
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016742 | Grad Max: 0.016742
[GRADIENT NORM TOTAL] 5.5673

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.600
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6857831  0.31421688] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.054
[MASKS] A(Pass/Fail): 697/1351 | B: 580/1468 | C: 324/1724
[LOSS Ex1] A: 0.65206 | B: 0.64653 | C: 0.64329
[LOGITS Ex2 A] Mean Abs: 1.899 | Max: 6.401
[LOSS Ex2] A: 0.15110 | B: 0.36279 | C: 0.28708
** [JOINT LOSS] ** : 0.914280
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002026 | Grad Max: 0.051933
  -> Layer: shared_layers.0.bias | Grad Mean: 0.162536 | Grad Max: 0.623986
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.006682
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005554 | Grad Max: 0.005554
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001097 | Grad Max: 0.150931
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019725 | Grad Max: 0.850115
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.006678
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008262 | Grad Max: 0.051616
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000277
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001790 | Grad Max: 0.004879
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000117
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000511 | Grad Max: 0.001196
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000566 | Grad Max: 0.001719
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011598 | Grad Max: 0.011598
[GRADIENT NORM TOTAL] 3.4376

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.673
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006405 0.4993595] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.053
[MASKS] A(Pass/Fail): 687/1361 | B: 533/1323 | C: 314/1734
[LOSS Ex1] A: 0.65751 | B: 0.65028 | C: 0.64631
[LOGITS Ex2 A] Mean Abs: 1.867 | Max: 6.047
[LOSS Ex2] A: 0.14992 | B: 0.36528 | C: 0.29211
** [JOINT LOSS] ** : 0.920470
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008375 | Grad Max: 0.266832
  -> Layer: shared_layers.0.bias | Grad Mean: 0.334491 | Grad Max: 1.378706
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005345
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001220 | Grad Max: 0.001220
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002320 | Grad Max: 0.235388
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042505 | Grad Max: 1.324685
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.011834
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021186 | Grad Max: 0.102795
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000512
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004696 | Grad Max: 0.009471
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001317 | Grad Max: 0.003041
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001625 | Grad Max: 0.003020
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029206 | Grad Max: 0.029206
[GRADIENT NORM TOTAL] 6.4413

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.453
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6503997 0.3496003] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.054
[MASKS] A(Pass/Fail): 671/1377 | B: 571/1477 | C: 333/1715
[LOSS Ex1] A: 0.65426 | B: 0.64941 | C: 0.64339
[LOGITS Ex2 A] Mean Abs: 1.872 | Max: 6.735
[LOSS Ex2] A: 0.16564 | B: 0.39060 | C: 0.27553
** [JOINT LOSS] ** : 0.926278
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007627 | Grad Max: 0.233659
  -> Layer: shared_layers.0.bias | Grad Mean: 0.362044 | Grad Max: 1.526210
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.006181
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001175 | Grad Max: 0.001175
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002438 | Grad Max: 0.288463
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045063 | Grad Max: 1.491289
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000397 | Grad Max: 0.012462
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021695 | Grad Max: 0.109409
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000603
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004776 | Grad Max: 0.009780
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000272
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001334 | Grad Max: 0.003046
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001525 | Grad Max: 0.002748
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029012 | Grad Max: 0.029012
[GRADIENT NORM TOTAL] 7.1553

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.531
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59464306 0.4053569 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.054
[MASKS] A(Pass/Fail): 558/1058 | B: 573/1475 | C: 313/1735
[LOSS Ex1] A: 0.65277 | B: 0.65017 | C: 0.64415
[LOGITS Ex2 A] Mean Abs: 1.941 | Max: 6.261
[LOSS Ex2] A: 0.16122 | B: 0.37455 | C: 0.27608
** [JOINT LOSS] ** : 0.919643
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.062991
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084707 | Grad Max: 0.471138
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006739
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007973 | Grad Max: 0.007973
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000708 | Grad Max: 0.168721
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012191 | Grad Max: 0.945031
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.004050
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003209 | Grad Max: 0.027569
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000172
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000647 | Grad Max: 0.003064
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000168 | Grad Max: 0.000874
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000277 | Grad Max: 0.000875
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001732 | Grad Max: 0.001732
[GRADIENT NORM TOTAL] 2.5010

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.675
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50698155 0.49301842] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.054
[MASKS] A(Pass/Fail): 684/1364 | B: 581/1467 | C: 321/1727
[LOSS Ex1] A: 0.65318 | B: 0.64628 | C: 0.64362
[LOGITS Ex2 A] Mean Abs: 1.947 | Max: 6.324
[LOSS Ex2] A: 0.15795 | B: 0.35944 | C: 0.32313
** [JOINT LOSS] ** : 0.927866
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005019 | Grad Max: 0.136536
  -> Layer: shared_layers.0.bias | Grad Mean: 0.366330 | Grad Max: 1.706127
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006122
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001594 | Grad Max: 0.001594
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002388 | Grad Max: 0.270911
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044035 | Grad Max: 1.509097
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000411 | Grad Max: 0.014969
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022675 | Grad Max: 0.122187
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000620
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004940 | Grad Max: 0.010771
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000288
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001368 | Grad Max: 0.003448
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001465 | Grad Max: 0.002796
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028841 | Grad Max: 0.028841
[GRADIENT NORM TOTAL] 7.3839

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.629
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105151  0.48948497] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.054
[MASKS] A(Pass/Fail): 670/1378 | B: 533/1323 | C: 213/1163
[LOSS Ex1] A: 0.65080 | B: 0.65006 | C: 0.64507
[LOGITS Ex2 A] Mean Abs: 1.940 | Max: 6.253
[LOSS Ex2] A: 0.15934 | B: 0.35523 | C: 0.28940
** [JOINT LOSS] ** : 0.916632
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004295 | Grad Max: 0.124322
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156461 | Grad Max: 0.629805
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.007087
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010112 | Grad Max: 0.010112
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001193 | Grad Max: 0.173119
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021104 | Grad Max: 0.955830
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.006280
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010819 | Grad Max: 0.049673
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000313
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002417 | Grad Max: 0.005562
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000683 | Grad Max: 0.001751
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000855 | Grad Max: 0.001999
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015580 | Grad Max: 0.015580
[GRADIENT NORM TOTAL] 3.2876

[EPOCH SUMMARY] Train Loss: 0.9217

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9083 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 95/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.649
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025354 0.4974646] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.054
[MASKS] A(Pass/Fail): 671/1377 | B: 571/1477 | C: 331/1717
[LOSS Ex1] A: 0.64979 | B: 0.64919 | C: 0.64341
[LOGITS Ex2 A] Mean Abs: 1.866 | Max: 8.249
[LOSS Ex2] A: 0.16750 | B: 0.41410 | C: 0.26060
** [JOINT LOSS] ** : 0.928197
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005639 | Grad Max: 0.160169
  -> Layer: shared_layers.0.bias | Grad Mean: 0.527535 | Grad Max: 2.250470
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.007001
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004425 | Grad Max: 0.004425
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003193 | Grad Max: 0.546493
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059821 | Grad Max: 3.085160
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000508 | Grad Max: 0.018525
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028232 | Grad Max: 0.151461
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000792
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005967 | Grad Max: 0.012679
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000336
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001649 | Grad Max: 0.003923
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001774 | Grad Max: 0.003227
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035210 | Grad Max: 0.035210
[GRADIENT NORM TOTAL] 10.6143

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.538
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50380105 0.49619892] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.053
[MASKS] A(Pass/Fail): 648/1400 | B: 573/1475 | C: 298/1750
[LOSS Ex1] A: 0.65674 | B: 0.64995 | C: 0.64724
[LOGITS Ex2 A] Mean Abs: 1.833 | Max: 5.960
[LOSS Ex2] A: 0.15232 | B: 0.41675 | C: 0.29010
** [JOINT LOSS] ** : 0.937705
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006994 | Grad Max: 0.205721
  -> Layer: shared_layers.0.bias | Grad Mean: 0.684926 | Grad Max: 2.771009
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.005480
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003252 | Grad Max: 0.003252
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004204 | Grad Max: 0.578660
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.079479 | Grad Max: 3.267402
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000707 | Grad Max: 0.022840
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039758 | Grad Max: 0.199027
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000986
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008554 | Grad Max: 0.017102
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000452
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002381 | Grad Max: 0.005461
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002683 | Grad Max: 0.005058
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.051953 | Grad Max: 0.051953
[GRADIENT NORM TOTAL] 13.4287

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.469
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5379829  0.46201715] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.542 | Std: 0.052
[MASKS] A(Pass/Fail): 642/1406 | B: 581/1467 | C: 297/1751
[LOSS Ex1] A: 0.65654 | B: 0.64607 | C: 0.64563
[LOGITS Ex2 A] Mean Abs: 1.820 | Max: 6.646
[LOSS Ex2] A: 0.16374 | B: 0.37791 | C: 0.27636
** [JOINT LOSS] ** : 0.922084
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004241 | Grad Max: 0.132382
  -> Layer: shared_layers.0.bias | Grad Mean: 0.462281 | Grad Max: 1.871760
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006087
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004991 | Grad Max: 0.004991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002784 | Grad Max: 0.413776
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052175 | Grad Max: 2.327711
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.015399
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026451 | Grad Max: 0.133797
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000670
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005692 | Grad Max: 0.012215
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000298
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001564 | Grad Max: 0.003514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001681 | Grad Max: 0.003240
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032810 | Grad Max: 0.032810
[GRADIENT NORM TOTAL] 9.0525

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.606
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6875627  0.31243733] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.054
[MASKS] A(Pass/Fail): 698/1350 | B: 534/1322 | C: 352/1696
[LOSS Ex1] A: 0.65173 | B: 0.64987 | C: 0.64473
[LOGITS Ex2 A] Mean Abs: 1.936 | Max: 6.048
[LOSS Ex2] A: 0.15329 | B: 0.35266 | C: 0.28110
** [JOINT LOSS] ** : 0.911122
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005415 | Grad Max: 0.184112
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132979 | Grad Max: 0.494975
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006469
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004833 | Grad Max: 0.004833
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001095 | Grad Max: 0.163435
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018904 | Grad Max: 0.930603
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.005367
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008530 | Grad Max: 0.037219
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000269
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001991 | Grad Max: 0.004718
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000143
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000564 | Grad Max: 0.001352
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000715 | Grad Max: 0.001856
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012701 | Grad Max: 0.012701
[GRADIENT NORM TOTAL] 3.0521

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.678
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005821 0.4994179] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.053
[MASKS] A(Pass/Fail): 688/1360 | B: 572/1476 | C: 376/1672
[LOSS Ex1] A: 0.65722 | B: 0.64900 | C: 0.63880
[LOGITS Ex2 A] Mean Abs: 1.954 | Max: 6.042
[LOSS Ex2] A: 0.15339 | B: 0.38495 | C: 0.25598
** [JOINT LOSS] ** : 0.913115
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003221 | Grad Max: 0.089909
  -> Layer: shared_layers.0.bias | Grad Mean: 0.250428 | Grad Max: 1.150944
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.005882
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003075 | Grad Max: 0.003075
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001575 | Grad Max: 0.200394
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029403 | Grad Max: 1.117517
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000266 | Grad Max: 0.009367
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014602 | Grad Max: 0.074623
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000383
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003027 | Grad Max: 0.006740
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000193
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000836 | Grad Max: 0.001982
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000851 | Grad Max: 0.002230
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017411 | Grad Max: 0.017411
[GRADIENT NORM TOTAL] 5.1473

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.457
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.65160936 0.34839067] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.054
[MASKS] A(Pass/Fail): 671/1377 | B: 574/1474 | C: 296/1752
[LOSS Ex1] A: 0.65395 | B: 0.64978 | C: 0.64636
[LOGITS Ex2 A] Mean Abs: 1.931 | Max: 6.034
[LOSS Ex2] A: 0.16987 | B: 0.38479 | C: 0.28297
** [JOINT LOSS] ** : 0.929240
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003009 | Grad Max: 0.118189
  -> Layer: shared_layers.0.bias | Grad Mean: 0.133539 | Grad Max: 0.617649
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.006054
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001187 | Grad Max: 0.001187
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001078 | Grad Max: 0.262906
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018350 | Grad Max: 1.484137
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.006227
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006360 | Grad Max: 0.049217
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000261
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001179 | Grad Max: 0.003702
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000333 | Grad Max: 0.001040
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001092
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007700 | Grad Max: 0.007700
[GRADIENT NORM TOTAL] 3.3885

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.536
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5953082  0.40469185] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.054
[MASKS] A(Pass/Fail): 559/1057 | B: 583/1465 | C: 322/1726
[LOSS Ex1] A: 0.65245 | B: 0.64589 | C: 0.64397
[LOGITS Ex2 A] Mean Abs: 1.928 | Max: 5.968
[LOSS Ex2] A: 0.15797 | B: 0.37491 | C: 0.28946
** [JOINT LOSS] ** : 0.921553
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007445 | Grad Max: 0.228778
  -> Layer: shared_layers.0.bias | Grad Mean: 0.329196 | Grad Max: 1.321798
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.006255
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003977 | Grad Max: 0.003977
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002315 | Grad Max: 0.405232
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041794 | Grad Max: 2.265126
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000365 | Grad Max: 0.011840
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019769 | Grad Max: 0.096096
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000540
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004441 | Grad Max: 0.009828
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000265
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001244 | Grad Max: 0.002910
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001544 | Grad Max: 0.002883
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027837 | Grad Max: 0.027837
[GRADIENT NORM TOTAL] 6.8702

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.680
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070216  0.49297833] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 685/1363 | B: 535/1321 | C: 328/1720
[LOSS Ex1] A: 0.65288 | B: 0.64969 | C: 0.64532
[LOGITS Ex2 A] Mean Abs: 1.937 | Max: 6.563
[LOSS Ex2] A: 0.14920 | B: 0.36861 | C: 0.29089
** [JOINT LOSS] ** : 0.918863
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006087 | Grad Max: 0.208961
  -> Layer: shared_layers.0.bias | Grad Mean: 0.349724 | Grad Max: 1.349849
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006362
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003453 | Grad Max: 0.003453
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002334 | Grad Max: 0.288815
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042769 | Grad Max: 1.582353
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.012947
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021901 | Grad Max: 0.100235
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000560
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004789 | Grad Max: 0.009834
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000286
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001324 | Grad Max: 0.003021
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001461 | Grad Max: 0.002551
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027969 | Grad Max: 0.027969
[GRADIENT NORM TOTAL] 6.8330

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.634
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51052964 0.48947036] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.054
[MASKS] A(Pass/Fail): 670/1378 | B: 573/1475 | C: 334/1714
[LOSS Ex1] A: 0.65049 | B: 0.64883 | C: 0.64394
[LOGITS Ex2 A] Mean Abs: 1.945 | Max: 6.028
[LOSS Ex2] A: 0.14719 | B: 0.38461 | C: 0.31787
** [JOINT LOSS] ** : 0.930979
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002720 | Grad Max: 0.065669
  -> Layer: shared_layers.0.bias | Grad Mean: 0.129176 | Grad Max: 0.698761
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002327 | Grad Max: 0.006949
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009562 | Grad Max: 0.009562
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000889 | Grad Max: 0.132039
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015749 | Grad Max: 0.745684
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.004849
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005998 | Grad Max: 0.035837
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000253
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001202 | Grad Max: 0.003638
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000108
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000311 | Grad Max: 0.001027
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000322 | Grad Max: 0.000920
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005527 | Grad Max: 0.005527
[GRADIENT NORM TOTAL] 2.7244

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.654
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025835  0.49741647] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.054
[MASKS] A(Pass/Fail): 671/1377 | B: 575/1473 | C: 320/1728
[LOSS Ex1] A: 0.64949 | B: 0.64961 | C: 0.64292
[LOGITS Ex2 A] Mean Abs: 1.918 | Max: 6.990
[LOSS Ex2] A: 0.17340 | B: 0.38335 | C: 0.29780
** [JOINT LOSS] ** : 0.932187
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003958 | Grad Max: 0.183119
  -> Layer: shared_layers.0.bias | Grad Mean: 0.162812 | Grad Max: 0.674775
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002273 | Grad Max: 0.006429
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001032 | Grad Max: 0.001032
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001197 | Grad Max: 0.137288
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020990 | Grad Max: 0.766810
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.005503
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009319 | Grad Max: 0.041284
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000324
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002115 | Grad Max: 0.005359
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000153
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000571 | Grad Max: 0.001635
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000510 | Grad Max: 0.001315
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010000 | Grad Max: 0.010000
[GRADIENT NORM TOTAL] 3.2415

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.543
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037752  0.49622482] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.543 | Std: 0.053
[MASKS] A(Pass/Fail): 649/1399 | B: 583/1465 | C: 356/1692
[LOSS Ex1] A: 0.65648 | B: 0.64571 | C: 0.63951
[LOGITS Ex2 A] Mean Abs: 1.867 | Max: 6.660
[LOSS Ex2] A: 0.15149 | B: 0.36338 | C: 0.27657
** [JOINT LOSS] ** : 0.911049
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002437 | Grad Max: 0.077779
  -> Layer: shared_layers.0.bias | Grad Mean: 0.200605 | Grad Max: 1.069011
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.006346
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005502 | Grad Max: 0.005502
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001363 | Grad Max: 0.230979
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025235 | Grad Max: 1.299665
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.008697
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013531 | Grad Max: 0.075741
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000402
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002849 | Grad Max: 0.006474
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000178
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000780 | Grad Max: 0.001873
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000839 | Grad Max: 0.002153
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016541 | Grad Max: 0.016541
[GRADIENT NORM TOTAL] 4.2594

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.473
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.538192   0.46180797] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 642/1406 | B: 535/1321 | C: 334/1714
[LOSS Ex1] A: 0.65628 | B: 0.64952 | C: 0.64381
[LOGITS Ex2 A] Mean Abs: 1.848 | Max: 6.091
[LOSS Ex2] A: 0.16384 | B: 0.35375 | C: 0.27194
** [JOINT LOSS] ** : 0.913046
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001728 | Grad Max: 0.028257
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107623 | Grad Max: 0.581931
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005953
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002961 | Grad Max: 0.002961
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000809 | Grad Max: 0.156247
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014485 | Grad Max: 0.877415
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.004809
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006144 | Grad Max: 0.036561
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000231
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001248 | Grad Max: 0.004075
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000106
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000334 | Grad Max: 0.000967
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000405 | Grad Max: 0.001311
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007062 | Grad Max: 0.007062
[GRADIENT NORM TOTAL] 2.4131

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.611
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6891596  0.31084046] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.055
[MASKS] A(Pass/Fail): 698/1350 | B: 574/1474 | C: 324/1724
[LOSS Ex1] A: 0.65143 | B: 0.64864 | C: 0.64354
[LOGITS Ex2 A] Mean Abs: 1.922 | Max: 5.829
[LOSS Ex2] A: 0.16209 | B: 0.37690 | C: 0.29753
** [JOINT LOSS] ** : 0.926711
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007499 | Grad Max: 0.249016
  -> Layer: shared_layers.0.bias | Grad Mean: 0.422647 | Grad Max: 1.521842
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006758
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005212 | Grad Max: 0.005212
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002786 | Grad Max: 0.265831
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051409 | Grad Max: 1.427521
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000462 | Grad Max: 0.014240
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025506 | Grad Max: 0.121124
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000636
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005595 | Grad Max: 0.011312
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000314
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001560 | Grad Max: 0.003646
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001684 | Grad Max: 0.003434
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032897 | Grad Max: 0.032897
[GRADIENT NORM TOTAL] 8.0389

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.684
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006233 0.4993767] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.054
[MASKS] A(Pass/Fail): 688/1360 | B: 575/1473 | C: 224/1152
[LOSS Ex1] A: 0.65695 | B: 0.64942 | C: 0.64461
[LOGITS Ex2 A] Mean Abs: 1.915 | Max: 5.933
[LOSS Ex2] A: 0.15008 | B: 0.37895 | C: 0.30964
** [JOINT LOSS] ** : 0.929885
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003621 | Grad Max: 0.142012
  -> Layer: shared_layers.0.bias | Grad Mean: 0.383318 | Grad Max: 1.897784
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002030 | Grad Max: 0.005862
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004593 | Grad Max: 0.004593
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002477 | Grad Max: 0.325905
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046312 | Grad Max: 1.836332
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000398 | Grad Max: 0.012600
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022290 | Grad Max: 0.113344
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000575
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004681 | Grad Max: 0.009953
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000279
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001274 | Grad Max: 0.003387
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001282 | Grad Max: 0.002633
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025329 | Grad Max: 0.025329
[GRADIENT NORM TOTAL] 8.2012

[EPOCH SUMMARY] Train Loss: 0.9233

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.9004 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9066 -> New: 0.9004)

############################## EPOCH 96/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.462
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.65282863 0.3471714 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 672/1376 | B: 583/1465 | C: 343/1705
[LOSS Ex1] A: 0.65367 | B: 0.64552 | C: 0.64389
[LOGITS Ex2 A] Mean Abs: 1.881 | Max: 5.733
[LOSS Ex2] A: 0.15994 | B: 0.36298 | C: 0.29075
** [JOINT LOSS] ** : 0.918913
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002945 | Grad Max: 0.067443
  -> Layer: shared_layers.0.bias | Grad Mean: 0.074574 | Grad Max: 0.329009
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006187
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000313 | Grad Max: 0.000313
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000600 | Grad Max: 0.293885
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010055 | Grad Max: 1.646561
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.003160
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002698 | Grad Max: 0.014748
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000149
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000626 | Grad Max: 0.002306
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000179 | Grad Max: 0.000676
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000307 | Grad Max: 0.001134
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004209 | Grad Max: 0.004209
[GRADIENT NORM TOTAL] 2.5083

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.540
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.59604824 0.40395176] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 562/1054 | B: 535/1321 | C: 342/1706
[LOSS Ex1] A: 0.65216 | B: 0.64934 | C: 0.63989
[LOGITS Ex2 A] Mean Abs: 1.886 | Max: 6.294
[LOSS Ex2] A: 0.15949 | B: 0.36875 | C: 0.27000
** [JOINT LOSS] ** : 0.913212
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006093 | Grad Max: 0.196240
  -> Layer: shared_layers.0.bias | Grad Mean: 0.242952 | Grad Max: 0.941032
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.006643
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006643 | Grad Max: 0.006643
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001600 | Grad Max: 0.238397
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029146 | Grad Max: 1.251453
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000273 | Grad Max: 0.009354
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015024 | Grad Max: 0.079867
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000415
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003342 | Grad Max: 0.007372
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000935 | Grad Max: 0.002374
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001095 | Grad Max: 0.002520
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020438 | Grad Max: 0.020438
[GRADIENT NORM TOTAL] 4.6204

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.686
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070279  0.49297202] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 685/1363 | B: 575/1473 | C: 356/1692
[LOSS Ex1] A: 0.65260 | B: 0.64846 | C: 0.63924
[LOGITS Ex2 A] Mean Abs: 1.906 | Max: 7.816
[LOSS Ex2] A: 0.15956 | B: 0.37810 | C: 0.26926
** [JOINT LOSS] ** : 0.915742
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002733 | Grad Max: 0.136429
  -> Layer: shared_layers.0.bias | Grad Mean: 0.057560 | Grad Max: 0.367193
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.006503
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003286 | Grad Max: 0.003286
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000495 | Grad Max: 0.119085
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007818 | Grad Max: 0.670085
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.004862
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001632 | Grad Max: 0.027185
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000120
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000262 | Grad Max: 0.001494
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000050
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000073 | Grad Max: 0.000366
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000338 | Grad Max: 0.000905
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000600 | Grad Max: 0.000600
[GRADIENT NORM TOTAL] 1.5736

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.640
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105924 0.4894076] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 670/1378 | B: 575/1473 | C: 329/1719
[LOSS Ex1] A: 0.65018 | B: 0.64923 | C: 0.64482
[LOGITS Ex2 A] Mean Abs: 1.909 | Max: 5.736
[LOSS Ex2] A: 0.16057 | B: 0.36709 | C: 0.27339
** [JOINT LOSS] ** : 0.915089
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003622 | Grad Max: 0.084126
  -> Layer: shared_layers.0.bias | Grad Mean: 0.174515 | Grad Max: 0.733827
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006538
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001468 | Grad Max: 0.001468
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001241 | Grad Max: 0.181908
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022746 | Grad Max: 0.934671
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.007335
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010350 | Grad Max: 0.053956
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000319
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002292 | Grad Max: 0.005506
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000141
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000642 | Grad Max: 0.001551
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000701 | Grad Max: 0.002139
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014272 | Grad Max: 0.014272
[GRADIENT NORM TOTAL] 3.7610

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.661
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50259775 0.49740222] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 671/1377 | B: 583/1465 | C: 378/1670
[LOSS Ex1] A: 0.64918 | B: 0.64532 | C: 0.63543
[LOGITS Ex2 A] Mean Abs: 1.882 | Max: 6.664
[LOSS Ex2] A: 0.16793 | B: 0.36451 | C: 0.26455
** [JOINT LOSS] ** : 0.908971
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003715 | Grad Max: 0.169901
  -> Layer: shared_layers.0.bias | Grad Mean: 0.050235 | Grad Max: 0.239643
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002415 | Grad Max: 0.006811
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000352 | Grad Max: 0.000352
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000606 | Grad Max: 0.129338
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008918 | Grad Max: 0.679771
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.003103
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002597 | Grad Max: 0.016190
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000204
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000679 | Grad Max: 0.002507
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000218 | Grad Max: 0.000844
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000367 | Grad Max: 0.001400
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005810 | Grad Max: 0.005810
[GRADIENT NORM TOTAL] 1.6805

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.547
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037513  0.49624872] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.054
[MASKS] A(Pass/Fail): 650/1398 | B: 536/1320 | C: 330/1718
[LOSS Ex1] A: 0.65620 | B: 0.64913 | C: 0.64390
[LOGITS Ex2 A] Mean Abs: 1.841 | Max: 5.661
[LOSS Ex2] A: 0.15141 | B: 0.36407 | C: 0.28654
** [JOINT LOSS] ** : 0.917084
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003607 | Grad Max: 0.103240
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289124 | Grad Max: 1.226943
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005566
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000742 | Grad Max: 0.000742
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001844 | Grad Max: 0.247787
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034318 | Grad Max: 1.375772
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.010585
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017673 | Grad Max: 0.085735
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000447
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003812 | Grad Max: 0.007730
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000228
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001046 | Grad Max: 0.002671
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001096 | Grad Max: 0.002374
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021606 | Grad Max: 0.021606
[GRADIENT NORM TOTAL] 5.5797

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.478
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5384094  0.46159056] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.052
[MASKS] A(Pass/Fail): 643/1405 | B: 576/1472 | C: 361/1687
[LOSS Ex1] A: 0.65601 | B: 0.64825 | C: 0.63861
[LOGITS Ex2 A] Mean Abs: 1.829 | Max: 6.945
[LOSS Ex2] A: 0.16952 | B: 0.38656 | C: 0.27433
** [JOINT LOSS] ** : 0.924426
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004697 | Grad Max: 0.157645
  -> Layer: shared_layers.0.bias | Grad Mean: 0.291073 | Grad Max: 1.213680
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.006031
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004450 | Grad Max: 0.004450
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.402839
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035926 | Grad Max: 2.259087
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.010536
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016803 | Grad Max: 0.085138
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000535
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003721 | Grad Max: 0.008742
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000222
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001038 | Grad Max: 0.002488
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001122 | Grad Max: 0.002164
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022024 | Grad Max: 0.022024
[GRADIENT NORM TOTAL] 6.2144

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.616
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.69079465 0.30920535] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.055
[MASKS] A(Pass/Fail): 698/1350 | B: 575/1473 | C: 302/1746
[LOSS Ex1] A: 0.65113 | B: 0.64902 | C: 0.64908
[LOGITS Ex2 A] Mean Abs: 1.919 | Max: 6.143
[LOSS Ex2] A: 0.15079 | B: 0.37253 | C: 0.28534
** [JOINT LOSS] ** : 0.919294
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004331 | Grad Max: 0.119956
  -> Layer: shared_layers.0.bias | Grad Mean: 0.252348 | Grad Max: 1.267571
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006698
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008157 | Grad Max: 0.008157
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001721 | Grad Max: 0.211721
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031844 | Grad Max: 1.182483
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000267 | Grad Max: 0.010416
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014895 | Grad Max: 0.083638
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000392
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003222 | Grad Max: 0.007221
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000192
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000886 | Grad Max: 0.002200
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000960 | Grad Max: 0.002462
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018776 | Grad Max: 0.018776
[GRADIENT NORM TOTAL] 5.3808

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.690
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50059927 0.49940073] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.054
[MASKS] A(Pass/Fail): 688/1360 | B: 584/1464 | C: 318/1730
[LOSS Ex1] A: 0.65669 | B: 0.64509 | C: 0.64578
[LOGITS Ex2 A] Mean Abs: 1.922 | Max: 5.716
[LOSS Ex2] A: 0.14807 | B: 0.36259 | C: 0.30354
** [JOINT LOSS] ** : 0.920586
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003847 | Grad Max: 0.132082
  -> Layer: shared_layers.0.bias | Grad Mean: 0.339030 | Grad Max: 1.715714
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005547
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002098 | Grad Max: 0.002098
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.258118
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040583 | Grad Max: 1.444995
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.012949
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019636 | Grad Max: 0.105320
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000549
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004034 | Grad Max: 0.008768
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001096 | Grad Max: 0.002698
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001034 | Grad Max: 0.002699
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022039 | Grad Max: 0.022039
[GRADIENT NORM TOTAL] 7.1605

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.466
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6540166 0.3459834] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 673/1375 | B: 536/1320 | C: 303/1745
[LOSS Ex1] A: 0.65338 | B: 0.64894 | C: 0.64840
[LOGITS Ex2 A] Mean Abs: 1.898 | Max: 5.937
[LOSS Ex2] A: 0.16241 | B: 0.35768 | C: 0.30479
** [JOINT LOSS] ** : 0.925200
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004528 | Grad Max: 0.180752
  -> Layer: shared_layers.0.bias | Grad Mean: 0.089789 | Grad Max: 0.389979
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.006179
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003382 | Grad Max: 0.003382
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000819 | Grad Max: 0.257296
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012554 | Grad Max: 1.440299
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003395
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002423 | Grad Max: 0.019616
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000555 | Grad Max: 0.002466
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000152 | Grad Max: 0.000702
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000218 | Grad Max: 0.000931
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002480 | Grad Max: 0.002480
[GRADIENT NORM TOTAL] 2.6876

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.545
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.596688  0.4033121] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 564/1052 | B: 576/1472 | C: 304/1744
[LOSS Ex1] A: 0.65186 | B: 0.64805 | C: 0.64409
[LOGITS Ex2 A] Mean Abs: 1.926 | Max: 7.010
[LOSS Ex2] A: 0.15149 | B: 0.38339 | C: 0.28864
** [JOINT LOSS] ** : 0.922508
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003531 | Grad Max: 0.102368
  -> Layer: shared_layers.0.bias | Grad Mean: 0.281718 | Grad Max: 1.088805
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.006336
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004806 | Grad Max: 0.004806
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001796 | Grad Max: 0.236063
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033131 | Grad Max: 1.324898
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.010876
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016093 | Grad Max: 0.084801
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000457
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003488 | Grad Max: 0.007454
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000196
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000989 | Grad Max: 0.002320
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001134 | Grad Max: 0.002166
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022470 | Grad Max: 0.022470
[GRADIENT NORM TOTAL] 5.7384

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.692
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50707453 0.49292544] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.056
[MASKS] A(Pass/Fail): 685/1363 | B: 576/1472 | C: 304/1744
[LOSS Ex1] A: 0.65230 | B: 0.64882 | C: 0.64535
[LOGITS Ex2 A] Mean Abs: 1.923 | Max: 6.011
[LOSS Ex2] A: 0.14773 | B: 0.38069 | C: 0.26708
** [JOINT LOSS] ** : 0.913988
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.055798
  -> Layer: shared_layers.0.bias | Grad Mean: 0.046717 | Grad Max: 0.217461
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.005891
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001940 | Grad Max: 0.001940
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000502 | Grad Max: 0.141548
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008125 | Grad Max: 0.797990
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003054
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001950 | Grad Max: 0.017916
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000312 | Grad Max: 0.001824
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000409
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000329 | Grad Max: 0.000944
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001386 | Grad Max: 0.001386
[GRADIENT NORM TOTAL] 1.5581

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.646
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105801 0.4894199] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 671/1377 | B: 584/1464 | C: 310/1738
[LOSS Ex1] A: 0.64986 | B: 0.64488 | C: 0.64556
[LOGITS Ex2 A] Mean Abs: 1.955 | Max: 6.326
[LOSS Ex2] A: 0.15885 | B: 0.35923 | C: 0.27840
** [JOINT LOSS] ** : 0.912259
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005340 | Grad Max: 0.145862
  -> Layer: shared_layers.0.bias | Grad Mean: 0.305627 | Grad Max: 1.438969
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.006788
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004484 | Grad Max: 0.004484
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.238422
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037281 | Grad Max: 1.265579
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.010728
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017488 | Grad Max: 0.086785
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000452
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003831 | Grad Max: 0.007888
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000224
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001068 | Grad Max: 0.002623
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001157 | Grad Max: 0.002566
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022778 | Grad Max: 0.022778
[GRADIENT NORM TOTAL] 6.1729

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.666
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50264376 0.49735624] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 672/1376 | B: 538/1318 | C: 243/1133
[LOSS Ex1] A: 0.64885 | B: 0.64872 | C: 0.63785
[LOGITS Ex2 A] Mean Abs: 1.913 | Max: 7.981
[LOSS Ex2] A: 0.16957 | B: 0.36069 | C: 0.30654
** [JOINT LOSS] ** : 0.924074
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004444 | Grad Max: 0.174911
  -> Layer: shared_layers.0.bias | Grad Mean: 0.172737 | Grad Max: 0.872728
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002356 | Grad Max: 0.006703
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004780 | Grad Max: 0.004780
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001243 | Grad Max: 0.149341
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021604 | Grad Max: 0.840314
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.006348
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009057 | Grad Max: 0.049678
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000366
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002066 | Grad Max: 0.005661
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000140
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000580 | Grad Max: 0.001472
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000697 | Grad Max: 0.001667
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013032 | Grad Max: 0.013032
[GRADIENT NORM TOTAL] 3.6014

[EPOCH SUMMARY] Train Loss: 0.9180

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8996 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.9004 -> New: 0.8996)

############################## EPOCH 97/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.552
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037284  0.49627158] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.054
[MASKS] A(Pass/Fail): 651/1397 | B: 576/1472 | C: 312/1736
[LOSS Ex1] A: 0.65593 | B: 0.64783 | C: 0.64488
[LOGITS Ex2 A] Mean Abs: 1.843 | Max: 6.628
[LOSS Ex2] A: 0.14347 | B: 0.38445 | C: 0.26389
** [JOINT LOSS] ** : 0.913483
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003338 | Grad Max: 0.108143
  -> Layer: shared_layers.0.bias | Grad Mean: 0.313096 | Grad Max: 1.486660
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.005695
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000098 | Grad Max: 0.000098
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.282588
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037907 | Grad Max: 1.564606
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.013057
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019375 | Grad Max: 0.108198
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000524
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004170 | Grad Max: 0.009141
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000263
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001146 | Grad Max: 0.002888
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001205 | Grad Max: 0.002852
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023631 | Grad Max: 0.023631
[GRADIENT NORM TOTAL] 6.4766

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.482
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53869236 0.46130767] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.543 | Std: 0.053
[MASKS] A(Pass/Fail): 645/1403 | B: 576/1472 | C: 331/1717
[LOSS Ex1] A: 0.65575 | B: 0.64860 | C: 0.64116
[LOGITS Ex2 A] Mean Abs: 1.813 | Max: 5.848
[LOSS Ex2] A: 0.15663 | B: 0.38761 | C: 0.26605
** [JOINT LOSS] ** : 0.918600
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004905 | Grad Max: 0.134786
  -> Layer: shared_layers.0.bias | Grad Mean: 0.259163 | Grad Max: 1.258257
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005718
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002746 | Grad Max: 0.002746
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001747 | Grad Max: 0.233337
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032362 | Grad Max: 1.320776
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.011393
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016903 | Grad Max: 0.083843
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000428
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003664 | Grad Max: 0.007820
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000266
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001014 | Grad Max: 0.002725
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001128 | Grad Max: 0.002499
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021548 | Grad Max: 0.021548
[GRADIENT NORM TOTAL] 5.1499

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.621
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6924594  0.30754057] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.056
[MASKS] A(Pass/Fail): 699/1349 | B: 590/1458 | C: 375/1673
[LOSS Ex1] A: 0.65083 | B: 0.64467 | C: 0.64370
[LOGITS Ex2 A] Mean Abs: 1.915 | Max: 6.192
[LOSS Ex2] A: 0.14329 | B: 0.35403 | C: 0.29519
** [JOINT LOSS] ** : 0.910570
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002754 | Grad Max: 0.067755
  -> Layer: shared_layers.0.bias | Grad Mean: 0.204390 | Grad Max: 0.834073
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006190
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000409 | Grad Max: 0.000409
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.206644
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025266 | Grad Max: 1.161623
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000192 | Grad Max: 0.007871
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010880 | Grad Max: 0.058610
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000379
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002379 | Grad Max: 0.005593
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000140
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000652 | Grad Max: 0.001609
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000655 | Grad Max: 0.002062
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013389 | Grad Max: 0.013389
[GRADIENT NORM TOTAL] 4.4152

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.695
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006403  0.49935973] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 690/1358 | B: 542/1314 | C: 316/1732
[LOSS Ex1] A: 0.65643 | B: 0.64853 | C: 0.64514
[LOGITS Ex2 A] Mean Abs: 1.905 | Max: 5.746
[LOSS Ex2] A: 0.15088 | B: 0.34604 | C: 0.27996
** [JOINT LOSS] ** : 0.908993
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002455 | Grad Max: 0.048362
  -> Layer: shared_layers.0.bias | Grad Mean: 0.148792 | Grad Max: 0.546292
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.005286
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001105 | Grad Max: 0.001105
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001031 | Grad Max: 0.184028
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018334 | Grad Max: 1.037607
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.006907
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008051 | Grad Max: 0.056742
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000266
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001673 | Grad Max: 0.004744
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000126
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000463 | Grad Max: 0.001309
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000507 | Grad Max: 0.001710
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009712 | Grad Max: 0.009712
[GRADIENT NORM TOTAL] 3.4489

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.470
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.65524715 0.3447528 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 673/1375 | B: 578/1470 | C: 322/1726
[LOSS Ex1] A: 0.65310 | B: 0.64764 | C: 0.64370
[LOGITS Ex2 A] Mean Abs: 1.903 | Max: 6.059
[LOSS Ex2] A: 0.15961 | B: 0.37894 | C: 0.28266
** [JOINT LOSS] ** : 0.921878
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003556 | Grad Max: 0.116319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132386 | Grad Max: 0.602117
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005982
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000394 | Grad Max: 0.000394
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000941 | Grad Max: 0.340815
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016561 | Grad Max: 1.915663
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.005806
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006545 | Grad Max: 0.034251
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000245
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001505 | Grad Max: 0.003795
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000123
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000425 | Grad Max: 0.001139
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001263
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008844 | Grad Max: 0.008844
[GRADIENT NORM TOTAL] 3.4825

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.550
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5974096  0.40259036] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.056
[MASKS] A(Pass/Fail): 565/1051 | B: 580/1468 | C: 337/1711
[LOSS Ex1] A: 0.65157 | B: 0.64841 | C: 0.64200
[LOGITS Ex2 A] Mean Abs: 1.937 | Max: 6.831
[LOSS Ex2] A: 0.14212 | B: 0.38007 | C: 0.26246
** [JOINT LOSS] ** : 0.908882
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001682 | Grad Max: 0.035577
  -> Layer: shared_layers.0.bias | Grad Mean: 0.112690 | Grad Max: 0.483143
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006164
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001428 | Grad Max: 0.001428
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000727 | Grad Max: 0.341326
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013179 | Grad Max: 1.915355
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004672
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005768 | Grad Max: 0.035713
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000211
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001227 | Grad Max: 0.003428
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000354 | Grad Max: 0.001059
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000478 | Grad Max: 0.001501
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008358 | Grad Max: 0.008358
[GRADIENT NORM TOTAL] 3.0891

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.697
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070841  0.49291593] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.056
[MASKS] A(Pass/Fail): 685/1363 | B: 591/1457 | C: 314/1734
[LOSS Ex1] A: 0.65201 | B: 0.64448 | C: 0.64433
[LOGITS Ex2 A] Mean Abs: 1.939 | Max: 8.111
[LOSS Ex2] A: 0.15284 | B: 0.35355 | C: 0.29670
** [JOINT LOSS] ** : 0.914631
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003240 | Grad Max: 0.086818
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226650 | Grad Max: 1.141403
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.005634
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000620 | Grad Max: 0.000620
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001465 | Grad Max: 0.191515
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027064 | Grad Max: 1.087110
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.010703
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012718 | Grad Max: 0.090364
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000358
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002730 | Grad Max: 0.006400
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000209
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000745 | Grad Max: 0.002159
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000772 | Grad Max: 0.001980
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015598 | Grad Max: 0.015598
[GRADIENT NORM TOTAL] 4.7794

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.651
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51068187 0.48931813] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.056
[MASKS] A(Pass/Fail): 673/1375 | B: 542/1314 | C: 338/1710
[LOSS Ex1] A: 0.64955 | B: 0.64834 | C: 0.64201
[LOGITS Ex2 A] Mean Abs: 1.910 | Max: 5.529
[LOSS Ex2] A: 0.15557 | B: 0.34976 | C: 0.27447
** [JOINT LOSS] ** : 0.906569
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002984 | Grad Max: 0.078484
  -> Layer: shared_layers.0.bias | Grad Mean: 0.076133 | Grad Max: 0.326454
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.007005
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007214 | Grad Max: 0.007214
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000618 | Grad Max: 0.131492
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010809 | Grad Max: 0.738301
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.003026
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003709 | Grad Max: 0.023413
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000197
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000843 | Grad Max: 0.002748
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000102
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000249 | Grad Max: 0.000885
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001421
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006213 | Grad Max: 0.006213
[GRADIENT NORM TOTAL] 1.9943

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.672
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026653 0.4973347] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 672/1376 | B: 579/1469 | C: 313/1735
[LOSS Ex1] A: 0.64854 | B: 0.64744 | C: 0.64273
[LOGITS Ex2 A] Mean Abs: 1.880 | Max: 5.936
[LOSS Ex2] A: 0.16252 | B: 0.39427 | C: 0.28240
** [JOINT LOSS] ** : 0.925967
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003463 | Grad Max: 0.093696
  -> Layer: shared_layers.0.bias | Grad Mean: 0.294893 | Grad Max: 1.308844
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006309
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000633 | Grad Max: 0.000633
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001831 | Grad Max: 0.240906
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033277 | Grad Max: 1.349881
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.010826
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016137 | Grad Max: 0.091073
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000440
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003300 | Grad Max: 0.007160
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000176
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000912 | Grad Max: 0.002170
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001007 | Grad Max: 0.001922
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020161 | Grad Max: 0.020161
[GRADIENT NORM TOTAL] 5.8132

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.557
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50367177 0.49632826] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.544 | Std: 0.055
[MASKS] A(Pass/Fail): 652/1396 | B: 580/1468 | C: 319/1729
[LOSS Ex1] A: 0.65566 | B: 0.64822 | C: 0.64255
[LOGITS Ex2 A] Mean Abs: 1.871 | Max: 6.925
[LOSS Ex2] A: 0.14036 | B: 0.38470 | C: 0.31055
** [JOINT LOSS] ** : 0.927349
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003455 | Grad Max: 0.096556
  -> Layer: shared_layers.0.bias | Grad Mean: 0.241048 | Grad Max: 0.919750
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005822
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004793 | Grad Max: 0.004793
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001562 | Grad Max: 0.215744
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029009 | Grad Max: 1.212730
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000271 | Grad Max: 0.009615
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015311 | Grad Max: 0.081188
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000438
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003327 | Grad Max: 0.007461
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000185
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000933 | Grad Max: 0.002145
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001035 | Grad Max: 0.002076
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020617 | Grad Max: 0.020617
[GRADIENT NORM TOTAL] 4.6905

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.486
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53897285 0.46102712] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.053
[MASKS] A(Pass/Fail): 647/1401 | B: 591/1457 | C: 327/1721
[LOSS Ex1] A: 0.65549 | B: 0.64427 | C: 0.64376
[LOGITS Ex2 A] Mean Abs: 1.885 | Max: 6.381
[LOSS Ex2] A: 0.16153 | B: 0.35856 | C: 0.28576
** [JOINT LOSS] ** : 0.916456
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002338 | Grad Max: 0.108712
  -> Layer: shared_layers.0.bias | Grad Mean: 0.255197 | Grad Max: 1.290209
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.006532
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008063 | Grad Max: 0.008063
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001618 | Grad Max: 0.269686
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029784 | Grad Max: 1.502250
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.009678
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013812 | Grad Max: 0.078806
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000383
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002952 | Grad Max: 0.006779
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000818 | Grad Max: 0.001864
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000776 | Grad Max: 0.001918
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016756 | Grad Max: 0.016756
[GRADIENT NORM TOTAL] 5.5258

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.626
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6940681  0.30593193] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.056
[MASKS] A(Pass/Fail): 699/1349 | B: 542/1314 | C: 348/1700
[LOSS Ex1] A: 0.65054 | B: 0.64814 | C: 0.63919
[LOGITS Ex2 A] Mean Abs: 1.946 | Max: 7.389
[LOSS Ex2] A: 0.15253 | B: 0.35521 | C: 0.26642
** [JOINT LOSS] ** : 0.904011
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002974 | Grad Max: 0.119486
  -> Layer: shared_layers.0.bias | Grad Mean: 0.306689 | Grad Max: 1.467408
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.006942
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007313 | Grad Max: 0.007313
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.284114
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036690 | Grad Max: 1.586659
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.012681
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017982 | Grad Max: 0.104483
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000472
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003842 | Grad Max: 0.007903
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000194
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001079 | Grad Max: 0.002334
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001121 | Grad Max: 0.002731
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023216 | Grad Max: 0.023216
[GRADIENT NORM TOTAL] 6.4874

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.701
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006926  0.49930742] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.055
[MASKS] A(Pass/Fail): 690/1358 | B: 579/1469 | C: 336/1712
[LOSS Ex1] A: 0.65617 | B: 0.64725 | C: 0.64046
[LOGITS Ex2 A] Mean Abs: 1.930 | Max: 5.474
[LOSS Ex2] A: 0.14399 | B: 0.37892 | C: 0.28321
** [JOINT LOSS] ** : 0.916664
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003037 | Grad Max: 0.107179
  -> Layer: shared_layers.0.bias | Grad Mean: 0.063745 | Grad Max: 0.272212
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005624
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000119 | Grad Max: 0.000119
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000620 | Grad Max: 0.112868
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009691 | Grad Max: 0.626685
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003098
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001794 | Grad Max: 0.015246
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000121
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000329 | Grad Max: 0.001788
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000095 | Grad Max: 0.000538
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000268 | Grad Max: 0.000946
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002527 | Grad Max: 0.002527
[GRADIENT NORM TOTAL] 1.9461

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.475
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6564448 0.3435552] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.056
[MASKS] A(Pass/Fail): 673/1375 | B: 581/1467 | C: 247/1129
[LOSS Ex1] A: 0.65282 | B: 0.64803 | C: 0.63509
[LOGITS Ex2 A] Mean Abs: 1.900 | Max: 6.110
[LOSS Ex2] A: 0.16380 | B: 0.38917 | C: 0.25571
** [JOINT LOSS] ** : 0.914872
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004107 | Grad Max: 0.114661
  -> Layer: shared_layers.0.bias | Grad Mean: 0.181688 | Grad Max: 0.850236
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006217
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000080 | Grad Max: 0.000080
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001197 | Grad Max: 0.240624
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021565 | Grad Max: 1.372044
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.006928
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010342 | Grad Max: 0.054509
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000373
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002283 | Grad Max: 0.005553
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000633 | Grad Max: 0.001596
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000717 | Grad Max: 0.001600
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013728 | Grad Max: 0.013728
[GRADIENT NORM TOTAL] 3.8256

[EPOCH SUMMARY] Train Loss: 0.9149

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8952 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8996 -> New: 0.8952)

############################## EPOCH 98/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.555
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.598105   0.40189496] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.056
[MASKS] A(Pass/Fail): 565/1051 | B: 591/1457 | C: 328/1720
[LOSS Ex1] A: 0.65128 | B: 0.64407 | C: 0.64169
[LOGITS Ex2 A] Mean Abs: 1.961 | Max: 6.226
[LOSS Ex2] A: 0.14329 | B: 0.35356 | C: 0.26171
** [JOINT LOSS] ** : 0.898536
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002602 | Grad Max: 0.104094
  -> Layer: shared_layers.0.bias | Grad Mean: 0.097140 | Grad Max: 0.353158
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006308
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001441 | Grad Max: 0.001441
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000591 | Grad Max: 0.331074
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009631 | Grad Max: 1.862821
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.002532
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001702 | Grad Max: 0.012283
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000134
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000328 | Grad Max: 0.001748
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000067
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.000528
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000494 | Grad Max: 0.001229
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002109 | Grad Max: 0.002109
[GRADIENT NORM TOTAL] 3.0726

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.703
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50706637 0.49293366] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.056
[MASKS] A(Pass/Fail): 685/1363 | B: 545/1311 | C: 335/1713
[LOSS Ex1] A: 0.65172 | B: 0.64795 | C: 0.64212
[LOGITS Ex2 A] Mean Abs: 1.927 | Max: 7.170
[LOSS Ex2] A: 0.15623 | B: 0.35631 | C: 0.29368
** [JOINT LOSS] ** : 0.916006
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004587 | Grad Max: 0.189319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126948 | Grad Max: 0.482543
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.006643
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007212 | Grad Max: 0.007212
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000937 | Grad Max: 0.111236
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016235 | Grad Max: 0.618981
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.004574
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006298 | Grad Max: 0.030756
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000289
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001455 | Grad Max: 0.004277
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000134
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000412 | Grad Max: 0.001135
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000520 | Grad Max: 0.001565
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008910 | Grad Max: 0.008910
[GRADIENT NORM TOTAL] 2.6042

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.657
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.510758   0.48924205] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.056
[MASKS] A(Pass/Fail): 674/1374 | B: 579/1469 | C: 348/1700
[LOSS Ex1] A: 0.64925 | B: 0.64705 | C: 0.64194
[LOGITS Ex2 A] Mean Abs: 1.927 | Max: 5.918
[LOSS Ex2] A: 0.14713 | B: 0.37986 | C: 0.29207
** [JOINT LOSS] ** : 0.919098
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002482 | Grad Max: 0.051171
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108573 | Grad Max: 0.457602
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002317 | Grad Max: 0.007003
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007662 | Grad Max: 0.007662
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000764 | Grad Max: 0.144771
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012977 | Grad Max: 0.812322
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004377
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004676 | Grad Max: 0.029701
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000236
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001002 | Grad Max: 0.003949
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000090
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000286 | Grad Max: 0.000811
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000363 | Grad Max: 0.001072
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006764 | Grad Max: 0.006764
[GRADIENT NORM TOTAL] 2.3478

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.678
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026678  0.49733222] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.056
[MASKS] A(Pass/Fail): 673/1375 | B: 581/1467 | C: 341/1707
[LOSS Ex1] A: 0.64824 | B: 0.64782 | C: 0.64054
[LOGITS Ex2 A] Mean Abs: 1.909 | Max: 5.750
[LOSS Ex2] A: 0.15385 | B: 0.38169 | C: 0.26414
** [JOINT LOSS] ** : 0.912091
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.077411
  -> Layer: shared_layers.0.bias | Grad Mean: 0.098181 | Grad Max: 0.545955
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.006189
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001615 | Grad Max: 0.001615
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000659 | Grad Max: 0.183948
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010956 | Grad Max: 1.029823
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.004784
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002650 | Grad Max: 0.032812
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000191
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000378 | Grad Max: 0.002547
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000067
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000098 | Grad Max: 0.000575
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000302 | Grad Max: 0.000888
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001702 | Grad Max: 0.001702
[GRADIENT NORM TOTAL] 2.3820

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.561
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5035884  0.49641162] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 652/1396 | B: 591/1457 | C: 358/1690
[LOSS Ex1] A: 0.65540 | B: 0.64385 | C: 0.63732
[LOGITS Ex2 A] Mean Abs: 1.923 | Max: 6.219
[LOSS Ex2] A: 0.14785 | B: 0.36394 | C: 0.26363
** [JOINT LOSS] ** : 0.903993
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003787 | Grad Max: 0.121252
  -> Layer: shared_layers.0.bias | Grad Mean: 0.236102 | Grad Max: 1.311202
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002240 | Grad Max: 0.005679
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002106 | Grad Max: 0.002106
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001526 | Grad Max: 0.197015
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027067 | Grad Max: 1.095376
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008248
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012480 | Grad Max: 0.069291
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000361
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002544 | Grad Max: 0.005758
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000147
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000699 | Grad Max: 0.001664
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000693 | Grad Max: 0.002251
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015084 | Grad Max: 0.015084
[GRADIENT NORM TOTAL] 4.8406

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.491
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53916943 0.46083054] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.054
[MASKS] A(Pass/Fail): 650/1398 | B: 546/1310 | C: 308/1740
[LOSS Ex1] A: 0.65523 | B: 0.64773 | C: 0.64364
[LOGITS Ex2 A] Mean Abs: 1.872 | Max: 7.241
[LOSS Ex2] A: 0.16472 | B: 0.34831 | C: 0.28146
** [JOINT LOSS] ** : 0.913695
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004327 | Grad Max: 0.179777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.070393 | Grad Max: 0.337605
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005715
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005712 | Grad Max: 0.005712
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000779 | Grad Max: 0.255199
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012363 | Grad Max: 1.430524
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.002708
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002711 | Grad Max: 0.016723
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000201
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000674 | Grad Max: 0.002451
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000196 | Grad Max: 0.000727
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001259
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004735 | Grad Max: 0.004735
[GRADIENT NORM TOTAL] 2.4282

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.632
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6958346  0.30416542] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.056
[MASKS] A(Pass/Fail): 701/1347 | B: 579/1469 | C: 326/1722
[LOSS Ex1] A: 0.65023 | B: 0.64682 | C: 0.64379
[LOGITS Ex2 A] Mean Abs: 1.927 | Max: 6.695
[LOSS Ex2] A: 0.14623 | B: 0.37875 | C: 0.27549
** [JOINT LOSS] ** : 0.913772
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002444 | Grad Max: 0.079410
  -> Layer: shared_layers.0.bias | Grad Mean: 0.076900 | Grad Max: 0.277646
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006159
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002450 | Grad Max: 0.002450
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000596 | Grad Max: 0.112218
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009764 | Grad Max: 0.623468
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003508
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002020 | Grad Max: 0.022959
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000148
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000327 | Grad Max: 0.001831
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000058
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000526
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000277 | Grad Max: 0.000843
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001392 | Grad Max: 0.001392
[GRADIENT NORM TOTAL] 1.7739

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.707
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500752 0.499248] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.056
[MASKS] A(Pass/Fail): 691/1357 | B: 581/1467 | C: 328/1720
[LOSS Ex1] A: 0.65588 | B: 0.64759 | C: 0.64169
[LOGITS Ex2 A] Mean Abs: 1.910 | Max: 6.132
[LOSS Ex2] A: 0.14497 | B: 0.36450 | C: 0.26686
** [JOINT LOSS] ** : 0.907168
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004114 | Grad Max: 0.133682
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140510 | Grad Max: 0.533981
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005531
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003480 | Grad Max: 0.003480
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000997 | Grad Max: 0.170212
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017329 | Grad Max: 0.930708
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.005168
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008378 | Grad Max: 0.038218
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000362
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001929 | Grad Max: 0.005340
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000133
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000542 | Grad Max: 0.001336
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000688 | Grad Max: 0.001906
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012346 | Grad Max: 0.012346
[GRADIENT NORM TOTAL] 2.8018

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.480
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6578215  0.34217852] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.056
[MASKS] A(Pass/Fail): 674/1374 | B: 591/1457 | C: 324/1724
[LOSS Ex1] A: 0.65250 | B: 0.64362 | C: 0.64259
[LOGITS Ex2 A] Mean Abs: 1.931 | Max: 6.341
[LOSS Ex2] A: 0.16779 | B: 0.35156 | C: 0.29391
** [JOINT LOSS] ** : 0.917324
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003797 | Grad Max: 0.152651
  -> Layer: shared_layers.0.bias | Grad Mean: 0.164791 | Grad Max: 0.748913
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.006600
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007894 | Grad Max: 0.007894
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001183 | Grad Max: 0.172921
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021271 | Grad Max: 0.974825
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.006117
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009302 | Grad Max: 0.044900
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000307
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002043 | Grad Max: 0.005410
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000136
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000540 | Grad Max: 0.001585
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000493 | Grad Max: 0.001338
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009815 | Grad Max: 0.009815
[GRADIENT NORM TOTAL] 3.5879

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.561
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5988689  0.40113115] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 565/1051 | B: 546/1310 | C: 331/1717
[LOSS Ex1] A: 0.65095 | B: 0.64751 | C: 0.64395
[LOGITS Ex2 A] Mean Abs: 1.980 | Max: 6.391
[LOSS Ex2] A: 0.15004 | B: 0.34948 | C: 0.26137
** [JOINT LOSS] ** : 0.901102
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006353 | Grad Max: 0.209249
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126074 | Grad Max: 0.514332
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006543
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000000 | Grad Max: 0.000000
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001102 | Grad Max: 0.138746
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019205 | Grad Max: 0.645856
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.004496
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007533 | Grad Max: 0.034574
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000264
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001755 | Grad Max: 0.004254
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000138
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000502 | Grad Max: 0.001316
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000624 | Grad Max: 0.001891
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011342 | Grad Max: 0.011342
[GRADIENT NORM TOTAL] 2.8226

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.709
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50702876 0.49297124] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 685/1363 | B: 581/1467 | C: 345/1703
[LOSS Ex1] A: 0.65138 | B: 0.64660 | C: 0.64096
[LOGITS Ex2 A] Mean Abs: 1.941 | Max: 5.787
[LOSS Ex2] A: 0.14805 | B: 0.39001 | C: 0.29980
** [JOINT LOSS] ** : 0.925596
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003549 | Grad Max: 0.112017
  -> Layer: shared_layers.0.bias | Grad Mean: 0.300744 | Grad Max: 1.526169
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.006227
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005485 | Grad Max: 0.005485
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001969 | Grad Max: 0.217558
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035242 | Grad Max: 1.240974
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000282 | Grad Max: 0.010580
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016063 | Grad Max: 0.095266
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000395
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003273 | Grad Max: 0.007241
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000892 | Grad Max: 0.002016
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000904 | Grad Max: 0.001951
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019491 | Grad Max: 0.019491
[GRADIENT NORM TOTAL] 6.1862

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.663
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107572 0.4892428] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 676/1372 | B: 582/1466 | C: 349/1699
[LOSS Ex1] A: 0.64888 | B: 0.64738 | C: 0.63926
[LOGITS Ex2 A] Mean Abs: 1.922 | Max: 6.370
[LOSS Ex2] A: 0.14823 | B: 0.38854 | C: 0.29717
** [JOINT LOSS] ** : 0.923152
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003125 | Grad Max: 0.155507
  -> Layer: shared_layers.0.bias | Grad Mean: 0.402405 | Grad Max: 1.932973
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.006287
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002028 | Grad Max: 0.002028
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002500 | Grad Max: 0.262708
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046757 | Grad Max: 1.468260
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.014984
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023687 | Grad Max: 0.124427
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000602
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005001 | Grad Max: 0.010213
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000265
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001385 | Grad Max: 0.003135
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001469 | Grad Max: 0.002717
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029935 | Grad Max: 0.029935
[GRADIENT NORM TOTAL] 8.1719

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.685
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026428 0.4973572] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.056
[MASKS] A(Pass/Fail): 673/1375 | B: 592/1456 | C: 323/1725
[LOSS Ex1] A: 0.64787 | B: 0.64340 | C: 0.64080
[LOGITS Ex2 A] Mean Abs: 1.924 | Max: 7.738
[LOSS Ex2] A: 0.15690 | B: 0.35529 | C: 0.26723
** [JOINT LOSS] ** : 0.903825
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002987 | Grad Max: 0.126158
  -> Layer: shared_layers.0.bias | Grad Mean: 0.051928 | Grad Max: 0.218182
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002346 | Grad Max: 0.006702
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001572 | Grad Max: 0.001572
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000579 | Grad Max: 0.118387
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008948 | Grad Max: 0.657061
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002451
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002009 | Grad Max: 0.015753
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000139
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000459 | Grad Max: 0.001965
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000058
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000134 | Grad Max: 0.000615
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000417 | Grad Max: 0.001300
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002833 | Grad Max: 0.002833
[GRADIENT NORM TOTAL] 1.6051

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.567
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50349283 0.49650717] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.056
[MASKS] A(Pass/Fail): 653/1395 | B: 548/1308 | C: 228/1148
[LOSS Ex1] A: 0.65506 | B: 0.64730 | C: 0.64237
[LOGITS Ex2 A] Mean Abs: 1.932 | Max: 6.943
[LOSS Ex2] A: 0.13854 | B: 0.35965 | C: 0.27434
** [JOINT LOSS] ** : 0.905754
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003110 | Grad Max: 0.095026
  -> Layer: shared_layers.0.bias | Grad Mean: 0.199225 | Grad Max: 1.146479
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005689
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003286 | Grad Max: 0.003286
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001335 | Grad Max: 0.209819
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023845 | Grad Max: 1.161205
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.007931
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009963 | Grad Max: 0.067029
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000271
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002012 | Grad Max: 0.004715
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000137
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000555 | Grad Max: 0.001586
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000556 | Grad Max: 0.001803
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012090 | Grad Max: 0.012090
[GRADIENT NORM TOTAL] 4.5088

[EPOCH SUMMARY] Train Loss: 0.9115

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8969 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 99/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.497
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53936666 0.46063334] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.544 | Std: 0.054
[MASKS] A(Pass/Fail): 650/1398 | B: 583/1465 | C: 346/1702
[LOSS Ex1] A: 0.65489 | B: 0.64639 | C: 0.63863
[LOGITS Ex2 A] Mean Abs: 1.912 | Max: 6.670
[LOSS Ex2] A: 0.16178 | B: 0.37320 | C: 0.28793
** [JOINT LOSS] ** : 0.920942
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004663 | Grad Max: 0.198675
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094583 | Grad Max: 0.422707
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.006250
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005382 | Grad Max: 0.005382
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000893 | Grad Max: 0.146546
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013795 | Grad Max: 0.836719
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.005276
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002836 | Grad Max: 0.031861
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000155
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000335 | Grad Max: 0.001955
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000059
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000497
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000152 | Grad Max: 0.000651
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001144 | Grad Max: 0.001144
[GRADIENT NORM TOTAL] 2.3153

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.639
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6978184 0.3021816] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.057
[MASKS] A(Pass/Fail): 701/1347 | B: 583/1465 | C: 351/1697
[LOSS Ex1] A: 0.64986 | B: 0.64716 | C: 0.64164
[LOGITS Ex2 A] Mean Abs: 1.944 | Max: 6.581
[LOSS Ex2] A: 0.14955 | B: 0.37739 | C: 0.27303
** [JOINT LOSS] ** : 0.912880
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005037 | Grad Max: 0.181466
  -> Layer: shared_layers.0.bias | Grad Mean: 0.180834 | Grad Max: 0.754812
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006118
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003880 | Grad Max: 0.003880
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001211 | Grad Max: 0.398425
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021454 | Grad Max: 2.227343
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.004925
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008558 | Grad Max: 0.043669
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000296
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001998 | Grad Max: 0.005069
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000551 | Grad Max: 0.001648
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000625 | Grad Max: 0.001714
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011501 | Grad Max: 0.011501
[GRADIENT NORM TOTAL] 4.5192

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.714
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007374  0.49926254] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.056
[MASKS] A(Pass/Fail): 693/1355 | B: 595/1453 | C: 326/1722
[LOSS Ex1] A: 0.65554 | B: 0.64317 | C: 0.64059
[LOGITS Ex2 A] Mean Abs: 1.959 | Max: 5.940
[LOSS Ex2] A: 0.14091 | B: 0.35635 | C: 0.27957
** [JOINT LOSS] ** : 0.905378
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003614 | Grad Max: 0.119542
  -> Layer: shared_layers.0.bias | Grad Mean: 0.060995 | Grad Max: 0.447178
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.005639
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003320 | Grad Max: 0.003320
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000620 | Grad Max: 0.151932
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009743 | Grad Max: 0.858445
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.003630
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002040 | Grad Max: 0.019361
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000149
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000407 | Grad Max: 0.002078
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000117 | Grad Max: 0.000610
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000330 | Grad Max: 0.001124
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002154 | Grad Max: 0.002154
[GRADIENT NORM TOTAL] 1.9225

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.485
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6592902 0.3407098] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.057
[MASKS] A(Pass/Fail): 675/1373 | B: 550/1306 | C: 321/1727
[LOSS Ex1] A: 0.65213 | B: 0.64709 | C: 0.64338
[LOGITS Ex2 A] Mean Abs: 1.971 | Max: 5.883
[LOSS Ex2] A: 0.15423 | B: 0.35066 | C: 0.29177
** [JOINT LOSS] ** : 0.913086
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002341 | Grad Max: 0.053266
  -> Layer: shared_layers.0.bias | Grad Mean: 0.076369 | Grad Max: 0.282901
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.006108
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003131 | Grad Max: 0.003131
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000737 | Grad Max: 0.132429
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013166 | Grad Max: 0.723791
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.003083
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004739 | Grad Max: 0.024761
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000256
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001014 | Grad Max: 0.003109
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000294 | Grad Max: 0.000847
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000399 | Grad Max: 0.001218
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007132 | Grad Max: 0.007132
[GRADIENT NORM TOTAL] 2.3690

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.567
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.599739 0.400261] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.057
[MASKS] A(Pass/Fail): 565/1051 | B: 589/1459 | C: 322/1726
[LOSS Ex1] A: 0.65057 | B: 0.64616 | C: 0.64483
[LOGITS Ex2 A] Mean Abs: 1.986 | Max: 5.978
[LOSS Ex2] A: 0.14665 | B: 0.37238 | C: 0.29098
** [JOINT LOSS] ** : 0.917190
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004051 | Grad Max: 0.122752
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153748 | Grad Max: 0.727666
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.006138
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005513 | Grad Max: 0.005513
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001088 | Grad Max: 0.221791
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018710 | Grad Max: 1.245827
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.007261
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005757 | Grad Max: 0.056577
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000245
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001020 | Grad Max: 0.003293
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000082
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000287 | Grad Max: 0.000921
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000328 | Grad Max: 0.001027
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007366 | Grad Max: 0.007366
[GRADIENT NORM TOTAL] 3.4932

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.717
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50706446 0.49293554] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 685/1363 | B: 586/1462 | C: 330/1718
[LOSS Ex1] A: 0.65101 | B: 0.64693 | C: 0.64401
[LOGITS Ex2 A] Mean Abs: 1.958 | Max: 6.533
[LOSS Ex2] A: 0.15694 | B: 0.38455 | C: 0.26710
** [JOINT LOSS] ** : 0.916846
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006526 | Grad Max: 0.287376
  -> Layer: shared_layers.0.bias | Grad Mean: 0.088711 | Grad Max: 0.413574
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005746
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004214 | Grad Max: 0.004214
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000968 | Grad Max: 0.129035
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014181 | Grad Max: 0.721700
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.003782
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002521 | Grad Max: 0.023948
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000210
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000416 | Grad Max: 0.002139
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000111 | Grad Max: 0.000589
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000224 | Grad Max: 0.000886
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000337 | Grad Max: 0.000337
[GRADIENT NORM TOTAL] 2.3964

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.670
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51074207 0.48925793] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 677/1371 | B: 595/1453 | C: 331/1717
[LOSS Ex1] A: 0.64848 | B: 0.64291 | C: 0.64228
[LOGITS Ex2 A] Mean Abs: 1.975 | Max: 6.932
[LOSS Ex2] A: 0.15132 | B: 0.35201 | C: 0.26996
** [JOINT LOSS] ** : 0.902322
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005406 | Grad Max: 0.191616
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132959 | Grad Max: 0.629512
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002297 | Grad Max: 0.006617
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002801 | Grad Max: 0.002801
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000998 | Grad Max: 0.130605
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017158 | Grad Max: 0.690229
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.004768
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006669 | Grad Max: 0.037654
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000266
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001554 | Grad Max: 0.004104
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000420 | Grad Max: 0.001155
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000479 | Grad Max: 0.001513
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008880 | Grad Max: 0.008880
[GRADIENT NORM TOTAL] 2.8333

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.692
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50277233 0.4972276 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 674/1374 | B: 550/1306 | C: 363/1685
[LOSS Ex1] A: 0.64746 | B: 0.64684 | C: 0.63831
[LOGITS Ex2 A] Mean Abs: 1.939 | Max: 6.780
[LOSS Ex2] A: 0.16346 | B: 0.35414 | C: 0.28170
** [JOINT LOSS] ** : 0.910634
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002954 | Grad Max: 0.074625
  -> Layer: shared_layers.0.bias | Grad Mean: 0.210012 | Grad Max: 0.921174
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.006539
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001501 | Grad Max: 0.001501
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001448 | Grad Max: 0.198511
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026822 | Grad Max: 1.110695
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.011451
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012933 | Grad Max: 0.090793
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000364
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002727 | Grad Max: 0.006249
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000156
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000744 | Grad Max: 0.001710
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000770 | Grad Max: 0.001926
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016072 | Grad Max: 0.016072
[GRADIENT NORM TOTAL] 4.4046

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.573
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503527 0.496473] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.545 | Std: 0.056
[MASKS] A(Pass/Fail): 654/1394 | B: 589/1459 | C: 355/1693
[LOSS Ex1] A: 0.65471 | B: 0.64591 | C: 0.63892
[LOGITS Ex2 A] Mean Abs: 1.925 | Max: 6.646
[LOSS Ex2] A: 0.14942 | B: 0.37511 | C: 0.25435
** [JOINT LOSS] ** : 0.906142
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004833 | Grad Max: 0.227712
  -> Layer: shared_layers.0.bias | Grad Mean: 0.086222 | Grad Max: 0.421480
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.006065
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006922 | Grad Max: 0.006922
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000907 | Grad Max: 0.084127
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015035 | Grad Max: 0.451856
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.005228
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006341 | Grad Max: 0.031658
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000328
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001551 | Grad Max: 0.004714
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000118
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000434 | Grad Max: 0.001246
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000572 | Grad Max: 0.001593
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009726 | Grad Max: 0.009726
[GRADIENT NORM TOTAL] 2.1604

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.503
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5395455  0.46045455] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 650/1398 | B: 587/1461 | C: 383/1665
[LOSS Ex1] A: 0.65454 | B: 0.64668 | C: 0.63670
[LOGITS Ex2 A] Mean Abs: 1.942 | Max: 6.635
[LOSS Ex2] A: 0.16367 | B: 0.39300 | C: 0.29746
** [JOINT LOSS] ** : 0.930686
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004982 | Grad Max: 0.213308
  -> Layer: shared_layers.0.bias | Grad Mean: 0.532631 | Grad Max: 2.805570
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.006362
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006043 | Grad Max: 0.006044
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003430 | Grad Max: 0.409364
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063531 | Grad Max: 2.289667
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.021397
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030777 | Grad Max: 0.182717
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000790
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006349 | Grad Max: 0.013457
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000314
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001695 | Grad Max: 0.003848
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001681 | Grad Max: 0.003270
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033673 | Grad Max: 0.033673
[GRADIENT NORM TOTAL] 11.2906

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.646
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6998182 0.3001818] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.058
[MASKS] A(Pass/Fail): 701/1347 | B: 595/1453 | C: 358/1690
[LOSS Ex1] A: 0.64947 | B: 0.64266 | C: 0.63878
[LOGITS Ex2 A] Mean Abs: 1.996 | Max: 7.641
[LOSS Ex2] A: 0.15528 | B: 0.36899 | C: 0.29220
** [JOINT LOSS] ** : 0.915797
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004637 | Grad Max: 0.250811
  -> Layer: shared_layers.0.bias | Grad Mean: 0.634245 | Grad Max: 2.920288
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.006849
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009956 | Grad Max: 0.009956
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003921 | Grad Max: 0.501487
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073638 | Grad Max: 2.806187
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000670 | Grad Max: 0.027398
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039082 | Grad Max: 0.243056
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000902
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008150 | Grad Max: 0.016798
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000429
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002213 | Grad Max: 0.005410
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002240 | Grad Max: 0.003898
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046237 | Grad Max: 0.046237
[GRADIENT NORM TOTAL] 12.9200

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.722
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50069064 0.49930933] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 695/1353 | B: 551/1305 | C: 357/1691
[LOSS Ex1] A: 0.65520 | B: 0.64662 | C: 0.64017
[LOGITS Ex2 A] Mean Abs: 1.960 | Max: 6.061
[LOSS Ex2] A: 0.14758 | B: 0.35546 | C: 0.25005
** [JOINT LOSS] ** : 0.898357
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005335 | Grad Max: 0.178315
  -> Layer: shared_layers.0.bias | Grad Mean: 0.214263 | Grad Max: 0.830343
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005570
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000108 | Grad Max: 0.000108
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001616 | Grad Max: 0.247969
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027728 | Grad Max: 1.379961
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000219 | Grad Max: 0.009189
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012046 | Grad Max: 0.082772
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000322
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002311 | Grad Max: 0.005716
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000147
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000638 | Grad Max: 0.001952
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000616 | Grad Max: 0.002107
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014136 | Grad Max: 0.014136
[GRADIENT NORM TOTAL] 4.5596

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.490
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6607528  0.33924723] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 675/1373 | B: 589/1459 | C: 352/1696
[LOSS Ex1] A: 0.65176 | B: 0.64569 | C: 0.63993
[LOGITS Ex2 A] Mean Abs: 1.905 | Max: 6.246
[LOSS Ex2] A: 0.16694 | B: 0.38354 | C: 0.29506
** [JOINT LOSS] ** : 0.927640
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009019 | Grad Max: 0.269181
  -> Layer: shared_layers.0.bias | Grad Mean: 0.451755 | Grad Max: 1.926358
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006223
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001505 | Grad Max: 0.001505
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002963 | Grad Max: 0.483830
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054606 | Grad Max: 2.708512
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000445 | Grad Max: 0.015873
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025394 | Grad Max: 0.135724
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000662
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005583 | Grad Max: 0.011440
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000307
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001510 | Grad Max: 0.003773
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001595 | Grad Max: 0.003014
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031550 | Grad Max: 0.031550
[GRADIENT NORM TOTAL] 9.5256

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.572
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6005669 0.3994331] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 566/1050 | B: 587/1461 | C: 231/1145
[LOSS Ex1] A: 0.65020 | B: 0.64648 | C: 0.64229
[LOGITS Ex2 A] Mean Abs: 1.943 | Max: 6.436
[LOSS Ex2] A: 0.14924 | B: 0.40213 | C: 0.29281
** [JOINT LOSS] ** : 0.927715
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009603 | Grad Max: 0.301469
  -> Layer: shared_layers.0.bias | Grad Mean: 0.663728 | Grad Max: 2.867584
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.006344
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003636 | Grad Max: 0.003636
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004200 | Grad Max: 0.624758
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.078505 | Grad Max: 3.443604
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000661 | Grad Max: 0.022856
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.038256 | Grad Max: 0.208065
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000840
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008207 | Grad Max: 0.016443
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000430
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002250 | Grad Max: 0.005046
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002377 | Grad Max: 0.004655
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048245 | Grad Max: 0.048245
[GRADIENT NORM TOTAL] 13.6166

[EPOCH SUMMARY] Train Loss: 0.9147

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8970 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 100/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.724
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50714004 0.49286   ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 686/1362 | B: 595/1453 | C: 376/1672
[LOSS Ex1] A: 0.65066 | B: 0.64246 | C: 0.63743
[LOGITS Ex2 A] Mean Abs: 1.940 | Max: 6.340
[LOSS Ex2] A: 0.14117 | B: 0.36981 | C: 0.28465
** [JOINT LOSS] ** : 0.908725
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005817 | Grad Max: 0.154150
  -> Layer: shared_layers.0.bias | Grad Mean: 0.461543 | Grad Max: 2.078842
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002329 | Grad Max: 0.006028
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000349 | Grad Max: 0.000349
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002873 | Grad Max: 0.450988
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053843 | Grad Max: 2.525108
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000443 | Grad Max: 0.016347
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025759 | Grad Max: 0.146840
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000589
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005449 | Grad Max: 0.010587
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000261
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001478 | Grad Max: 0.003235
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001510 | Grad Max: 0.002667
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030893 | Grad Max: 0.030893
[GRADIENT NORM TOTAL] 9.5671

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.677
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51071805 0.489282  ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 679/1369 | B: 551/1305 | C: 384/1664
[LOSS Ex1] A: 0.64813 | B: 0.64643 | C: 0.63772
[LOGITS Ex2 A] Mean Abs: 1.966 | Max: 5.603
[LOSS Ex2] A: 0.14941 | B: 0.34913 | C: 0.29525
** [JOINT LOSS] ** : 0.908691
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005582 | Grad Max: 0.184128
  -> Layer: shared_layers.0.bias | Grad Mean: 0.118806 | Grad Max: 0.557535
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002347 | Grad Max: 0.006982
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009199 | Grad Max: 0.009199
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001094 | Grad Max: 0.157771
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018526 | Grad Max: 0.857067
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000161 | Grad Max: 0.005486
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008836 | Grad Max: 0.041172
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000296
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002061 | Grad Max: 0.005135
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000153
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000559 | Grad Max: 0.001712
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.001338
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011206 | Grad Max: 0.011206
[GRADIENT NORM TOTAL] 2.6726

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.698
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028224 0.4971776] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.057
[MASKS] A(Pass/Fail): 677/1371 | B: 589/1459 | C: 339/1709
[LOSS Ex1] A: 0.64712 | B: 0.64551 | C: 0.64213
[LOGITS Ex2 A] Mean Abs: 1.946 | Max: 8.072
[LOSS Ex2] A: 0.16552 | B: 0.37274 | C: 0.29654
** [JOINT LOSS] ** : 0.923188
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006238 | Grad Max: 0.256179
  -> Layer: shared_layers.0.bias | Grad Mean: 0.224252 | Grad Max: 0.906970
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.006803
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005780 | Grad Max: 0.005780
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001598 | Grad Max: 0.200810
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027485 | Grad Max: 1.121197
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000257 | Grad Max: 0.009542
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014390 | Grad Max: 0.075749
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000455
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003226 | Grad Max: 0.007650
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000195
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000880 | Grad Max: 0.002158
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000871 | Grad Max: 0.001820
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017663 | Grad Max: 0.017663
[GRADIENT NORM TOTAL] 4.5137

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.577
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034805 0.4965195] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.057
[MASKS] A(Pass/Fail): 655/1393 | B: 587/1461 | C: 383/1665
[LOSS Ex1] A: 0.65442 | B: 0.64630 | C: 0.63761
[LOGITS Ex2 A] Mean Abs: 1.893 | Max: 5.919
[LOSS Ex2] A: 0.14406 | B: 0.38353 | C: 0.24818
** [JOINT LOSS] ** : 0.904701
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002543 | Grad Max: 0.070123
  -> Layer: shared_layers.0.bias | Grad Mean: 0.206282 | Grad Max: 0.905786
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005982
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004674 | Grad Max: 0.004674
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001337 | Grad Max: 0.108417
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024208 | Grad Max: 0.596562
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.008216
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012851 | Grad Max: 0.074537
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000352
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002611 | Grad Max: 0.006445
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000708 | Grad Max: 0.001698
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000732 | Grad Max: 0.001845
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015154 | Grad Max: 0.015154
[GRADIENT NORM TOTAL] 4.0296

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.508
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5397359  0.46026412] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 650/1398 | B: 595/1453 | C: 374/1674
[LOSS Ex1] A: 0.65428 | B: 0.64228 | C: 0.63853
[LOGITS Ex2 A] Mean Abs: 1.880 | Max: 6.076
[LOSS Ex2] A: 0.15703 | B: 0.35436 | C: 0.25833
** [JOINT LOSS] ** : 0.901602
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002436 | Grad Max: 0.067971
  -> Layer: shared_layers.0.bias | Grad Mean: 0.119079 | Grad Max: 0.686614
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006434
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006356 | Grad Max: 0.006356
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000877 | Grad Max: 0.210266
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015702 | Grad Max: 1.169948
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004807
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007278 | Grad Max: 0.040859
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001576 | Grad Max: 0.004185
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000124
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000422 | Grad Max: 0.001317
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000444 | Grad Max: 0.001545
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008410 | Grad Max: 0.008410
[GRADIENT NORM TOTAL] 2.7253

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.651
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70143205 0.29856795] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.058
[MASKS] A(Pass/Fail): 701/1347 | B: 551/1305 | C: 360/1688
[LOSS Ex1] A: 0.64919 | B: 0.64625 | C: 0.64133
[LOGITS Ex2 A] Mean Abs: 1.951 | Max: 6.021
[LOSS Ex2] A: 0.14425 | B: 0.35340 | C: 0.29556
** [JOINT LOSS] ** : 0.909994
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004375 | Grad Max: 0.141982
  -> Layer: shared_layers.0.bias | Grad Mean: 0.360922 | Grad Max: 1.796864
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.006229
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005185 | Grad Max: 0.005185
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002362 | Grad Max: 0.365596
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044099 | Grad Max: 2.035900
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000368 | Grad Max: 0.014247
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021550 | Grad Max: 0.133633
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000515
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004612 | Grad Max: 0.009916
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000229
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001267 | Grad Max: 0.003016
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001303 | Grad Max: 0.002832
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027208 | Grad Max: 0.027208
[GRADIENT NORM TOTAL] 7.7475

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.727
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007449  0.49925512] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 695/1353 | B: 590/1458 | C: 356/1692
[LOSS Ex1] A: 0.65495 | B: 0.64533 | C: 0.64113
[LOGITS Ex2 A] Mean Abs: 1.968 | Max: 5.586
[LOSS Ex2] A: 0.14968 | B: 0.36760 | C: 0.26440
** [JOINT LOSS] ** : 0.907698
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004321 | Grad Max: 0.155414
  -> Layer: shared_layers.0.bias | Grad Mean: 0.363930 | Grad Max: 1.980787
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.005721
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002159 | Grad Max: 0.002159
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002391 | Grad Max: 0.299329
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042835 | Grad Max: 1.675438
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000353 | Grad Max: 0.014109
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020377 | Grad Max: 0.120519
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000524
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004077 | Grad Max: 0.008853
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000193
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001102 | Grad Max: 0.002574
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001023 | Grad Max: 0.002434
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022918 | Grad Max: 0.022918
[GRADIENT NORM TOTAL] 7.7167

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.495
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6619067  0.33809328] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.058
[MASKS] A(Pass/Fail): 676/1372 | B: 588/1460 | C: 397/1651
[LOSS Ex1] A: 0.65151 | B: 0.64612 | C: 0.63640
[LOGITS Ex2 A] Mean Abs: 1.940 | Max: 6.783
[LOSS Ex2] A: 0.16010 | B: 0.36261 | C: 0.28385
** [JOINT LOSS] ** : 0.913529
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004801 | Grad Max: 0.215230
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111629 | Grad Max: 0.441943
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005842
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000865 | Grad Max: 0.000865
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001068 | Grad Max: 0.299396
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016923 | Grad Max: 1.670574
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000083 | Grad Max: 0.005645
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002976 | Grad Max: 0.040165
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000177
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000342 | Grad Max: 0.002687
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000059
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000078 | Grad Max: 0.000457
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000261 | Grad Max: 0.000719
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000060 | Grad Max: 0.000060
[GRADIENT NORM TOTAL] 3.1949

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.577
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.60130453 0.3986955 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.058
[MASKS] A(Pass/Fail): 569/1047 | B: 595/1453 | C: 342/1706
[LOSS Ex1] A: 0.64994 | B: 0.64209 | C: 0.64159
[LOGITS Ex2 A] Mean Abs: 1.959 | Max: 6.325
[LOSS Ex2] A: 0.14776 | B: 0.36245 | C: 0.29461
** [JOINT LOSS] ** : 0.912815
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008347 | Grad Max: 0.248791
  -> Layer: shared_layers.0.bias | Grad Mean: 0.437573 | Grad Max: 1.962619
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.006474
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004923 | Grad Max: 0.004923
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003007 | Grad Max: 0.508107
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054959 | Grad Max: 2.834235
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.016484
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026302 | Grad Max: 0.155125
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000575
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005777 | Grad Max: 0.011339
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001589 | Grad Max: 0.003429
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001692 | Grad Max: 0.003027
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033774 | Grad Max: 0.033774
[GRADIENT NORM TOTAL] 9.3294

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.729
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071095 0.4928905] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 686/1362 | B: 551/1305 | C: 373/1675
[LOSS Ex1] A: 0.65041 | B: 0.64608 | C: 0.63797
[LOGITS Ex2 A] Mean Abs: 1.932 | Max: 7.262
[LOSS Ex2] A: 0.14570 | B: 0.36462 | C: 0.27459
** [JOINT LOSS] ** : 0.906455
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006228 | Grad Max: 0.218407
  -> Layer: shared_layers.0.bias | Grad Mean: 0.420264 | Grad Max: 1.838217
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.006036
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001649 | Grad Max: 0.001649
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002667 | Grad Max: 0.437925
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049107 | Grad Max: 2.460638
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.014186
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024186 | Grad Max: 0.122641
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000603
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005246 | Grad Max: 0.011709
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000276
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001400 | Grad Max: 0.003477
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001330 | Grad Max: 0.002640
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027646 | Grad Max: 0.027646
[GRADIENT NORM TOTAL] 8.4343

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.682
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108116  0.48918837] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 679/1369 | B: 590/1458 | C: 382/1666
[LOSS Ex1] A: 0.64785 | B: 0.64517 | C: 0.64249
[LOGITS Ex2 A] Mean Abs: 1.947 | Max: 5.656
[LOSS Ex2] A: 0.15268 | B: 0.37899 | C: 0.27146
** [JOINT LOSS] ** : 0.912879
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004128 | Grad Max: 0.128807
  -> Layer: shared_layers.0.bias | Grad Mean: 0.080290 | Grad Max: 0.315624
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005938
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003415 | Grad Max: 0.003415
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000685 | Grad Max: 0.192103
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010979 | Grad Max: 1.033822
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.003707
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001889 | Grad Max: 0.017395
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000366 | Grad Max: 0.002167
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000064
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000093 | Grad Max: 0.000508
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000188 | Grad Max: 0.000691
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001119 | Grad Max: 0.001119
[GRADIENT NORM TOTAL] 2.2742

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.704
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50283486 0.49716514] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 678/1370 | B: 590/1458 | C: 360/1688
[LOSS Ex1] A: 0.64685 | B: 0.64596 | C: 0.64174
[LOGITS Ex2 A] Mean Abs: 1.939 | Max: 7.114
[LOSS Ex2] A: 0.17289 | B: 0.38300 | C: 0.27349
** [JOINT LOSS] ** : 0.921309
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008342 | Grad Max: 0.278815
  -> Layer: shared_layers.0.bias | Grad Mean: 0.396617 | Grad Max: 1.675713
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002283 | Grad Max: 0.006872
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006483 | Grad Max: 0.006483
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002701 | Grad Max: 0.338292
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048470 | Grad Max: 1.885007
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000410 | Grad Max: 0.013876
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023412 | Grad Max: 0.127691
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000552
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005215 | Grad Max: 0.010615
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000274
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001431 | Grad Max: 0.003205
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001404 | Grad Max: 0.002933
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029218 | Grad Max: 0.029218
[GRADIENT NORM TOTAL] 7.9446

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.582
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034557  0.49654427] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.057
[MASKS] A(Pass/Fail): 655/1393 | B: 595/1453 | C: 365/1683
[LOSS Ex1] A: 0.65418 | B: 0.64193 | C: 0.64105
[LOGITS Ex2 A] Mean Abs: 1.919 | Max: 5.973
[LOSS Ex2] A: 0.14167 | B: 0.36094 | C: 0.27664
** [JOINT LOSS] ** : 0.905466
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004224 | Grad Max: 0.131551
  -> Layer: shared_layers.0.bias | Grad Mean: 0.234346 | Grad Max: 0.858160
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.006135
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006884 | Grad Max: 0.006884
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001514 | Grad Max: 0.226866
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027523 | Grad Max: 1.133134
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000209 | Grad Max: 0.008571
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012065 | Grad Max: 0.075100
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000346
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002647 | Grad Max: 0.006221
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000129
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000727 | Grad Max: 0.001596
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000755 | Grad Max: 0.002177
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015737 | Grad Max: 0.015737
[GRADIENT NORM TOTAL] 4.7485

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.512
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.539877   0.46012303] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.055
[MASKS] A(Pass/Fail): 652/1396 | B: 553/1303 | C: 220/1156
[LOSS Ex1] A: 0.65403 | B: 0.64593 | C: 0.64352
[LOGITS Ex2 A] Mean Abs: 1.856 | Max: 6.060
[LOSS Ex2] A: 0.15246 | B: 0.35642 | C: 0.26574
** [JOINT LOSS] ** : 0.906031
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003294 | Grad Max: 0.063640
  -> Layer: shared_layers.0.bias | Grad Mean: 0.219714 | Grad Max: 1.069928
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.005892
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005910 | Grad Max: 0.005910
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001510 | Grad Max: 0.192377
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028070 | Grad Max: 1.096393
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000248 | Grad Max: 0.009411
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014274 | Grad Max: 0.076426
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000377
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002964 | Grad Max: 0.006729
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000185
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000764 | Grad Max: 0.002234
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000683 | Grad Max: 0.002056
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014071 | Grad Max: 0.014071
[GRADIENT NORM TOTAL] 4.6629

[EPOCH SUMMARY] Train Loss: 0.9102

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8902 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8952 -> New: 0.8902)

############################## EPOCH 101/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.656
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70274216 0.2972578 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.058
[MASKS] A(Pass/Fail): 703/1345 | B: 591/1457 | C: 380/1668
[LOSS Ex1] A: 0.64893 | B: 0.64500 | C: 0.63880
[LOGITS Ex2 A] Mean Abs: 1.891 | Max: 6.458
[LOSS Ex2] A: 0.14451 | B: 0.37709 | C: 0.26814
** [JOINT LOSS] ** : 0.907494
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.038686
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139843 | Grad Max: 0.564764
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.006667
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008075 | Grad Max: 0.008075
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001035 | Grad Max: 0.135715
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018849 | Grad Max: 0.747233
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.005727
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009201 | Grad Max: 0.049766
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000307
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001927 | Grad Max: 0.005181
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000138
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000519 | Grad Max: 0.001473
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.001434
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010272 | Grad Max: 0.010272
[GRADIENT NORM TOTAL] 3.1320

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.733
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50068337 0.49931663] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 696/1352 | B: 592/1456 | C: 359/1689
[LOSS Ex1] A: 0.65472 | B: 0.64580 | C: 0.64034
[LOGITS Ex2 A] Mean Abs: 1.931 | Max: 5.809
[LOSS Ex2] A: 0.14463 | B: 0.37103 | C: 0.26920
** [JOINT LOSS] ** : 0.908571
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003137 | Grad Max: 0.127879
  -> Layer: shared_layers.0.bias | Grad Mean: 0.290398 | Grad Max: 1.575827
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005470
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003401 | Grad Max: 0.003401
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001836 | Grad Max: 0.282147
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033828 | Grad Max: 1.582695
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000260 | Grad Max: 0.011429
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015207 | Grad Max: 0.103402
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000389
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003094 | Grad Max: 0.006896
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000828 | Grad Max: 0.002091
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000706 | Grad Max: 0.002442
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016183 | Grad Max: 0.016183
[GRADIENT NORM TOTAL] 6.3822

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.498
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.66274685 0.3372532 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.058
[MASKS] A(Pass/Fail): 677/1371 | B: 595/1453 | C: 359/1689
[LOSS Ex1] A: 0.65125 | B: 0.64176 | C: 0.64380
[LOGITS Ex2 A] Mean Abs: 1.931 | Max: 5.747
[LOSS Ex2] A: 0.15702 | B: 0.35995 | C: 0.27701
** [JOINT LOSS] ** : 0.910263
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003006 | Grad Max: 0.108121
  -> Layer: shared_layers.0.bias | Grad Mean: 0.265343 | Grad Max: 1.436825
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.005576
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001805 | Grad Max: 0.001805
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001794 | Grad Max: 0.260526
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032448 | Grad Max: 1.461302
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000243 | Grad Max: 0.012524
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014200 | Grad Max: 0.103645
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000336
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002928 | Grad Max: 0.006610
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000782 | Grad Max: 0.002388
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000668 | Grad Max: 0.001799
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015594 | Grad Max: 0.015594
[GRADIENT NORM TOTAL] 6.0342

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.581
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6017169  0.39828318] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.058
[MASKS] A(Pass/Fail): 569/1047 | B: 554/1302 | C: 399/1649
[LOSS Ex1] A: 0.64969 | B: 0.64577 | C: 0.63676
[LOGITS Ex2 A] Mean Abs: 1.952 | Max: 6.438
[LOSS Ex2] A: 0.14697 | B: 0.35226 | C: 0.27164
** [JOINT LOSS] ** : 0.901029
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003471 | Grad Max: 0.106191
  -> Layer: shared_layers.0.bias | Grad Mean: 0.128970 | Grad Max: 0.522312
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006500
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003893 | Grad Max: 0.003893
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000900 | Grad Max: 0.078952
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016111 | Grad Max: 0.434302
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.005744
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008104 | Grad Max: 0.042461
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000278
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001731 | Grad Max: 0.004800
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000127
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000464 | Grad Max: 0.001373
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000516 | Grad Max: 0.001580
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009354 | Grad Max: 0.009354
[GRADIENT NORM TOTAL] 2.4529

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.735
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071762  0.49282378] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.059
[MASKS] A(Pass/Fail): 686/1362 | B: 591/1457 | C: 406/1642
[LOSS Ex1] A: 0.65016 | B: 0.64484 | C: 0.63508
[LOGITS Ex2 A] Mean Abs: 1.956 | Max: 6.693
[LOSS Ex2] A: 0.14687 | B: 0.36545 | C: 0.28073
** [JOINT LOSS] ** : 0.907712
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002720 | Grad Max: 0.112867
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107929 | Grad Max: 0.497126
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.005987
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002305 | Grad Max: 0.002305
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000859 | Grad Max: 0.329640
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014823 | Grad Max: 1.834348
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004455
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004589 | Grad Max: 0.030420
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000204
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000881 | Grad Max: 0.003300
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000229 | Grad Max: 0.000807
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000320 | Grad Max: 0.001102
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004064 | Grad Max: 0.004064
[GRADIENT NORM TOTAL] 3.2317

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.687
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51071924 0.48928076] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 681/1367 | B: 594/1454 | C: 385/1663
[LOSS Ex1] A: 0.64759 | B: 0.64563 | C: 0.64256
[LOGITS Ex2 A] Mean Abs: 1.942 | Max: 6.360
[LOSS Ex2] A: 0.15014 | B: 0.37294 | C: 0.28961
** [JOINT LOSS] ** : 0.916158
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.049300
  -> Layer: shared_layers.0.bias | Grad Mean: 0.115825 | Grad Max: 0.533249
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006356
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005648 | Grad Max: 0.005648
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000948 | Grad Max: 0.344423
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016944 | Grad Max: 1.925452
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000109 | Grad Max: 0.005679
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006159 | Grad Max: 0.043075
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000269
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001306 | Grad Max: 0.004492
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000109
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000339 | Grad Max: 0.001188
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000318 | Grad Max: 0.001107
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006022 | Grad Max: 0.006022
[GRADIENT NORM TOTAL] 3.6133

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.709
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029533 0.4970467] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 680/1368 | B: 595/1453 | C: 371/1677
[LOSS Ex1] A: 0.64659 | B: 0.64158 | C: 0.64221
[LOGITS Ex2 A] Mean Abs: 1.935 | Max: 6.623
[LOSS Ex2] A: 0.16042 | B: 0.35340 | C: 0.27470
** [JOINT LOSS] ** : 0.906298
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002587 | Grad Max: 0.085797
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149927 | Grad Max: 0.670079
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006240
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002283 | Grad Max: 0.002283
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001037 | Grad Max: 0.147198
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018021 | Grad Max: 0.798465
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.007346
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006822 | Grad Max: 0.055697
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000240
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001298 | Grad Max: 0.003437
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000096
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000344 | Grad Max: 0.001097
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000369 | Grad Max: 0.001239
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007417 | Grad Max: 0.007417
[GRADIENT NORM TOTAL] 3.1952

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.586
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034871  0.49651292] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.057
[MASKS] A(Pass/Fail): 656/1392 | B: 554/1302 | C: 383/1665
[LOSS Ex1] A: 0.65394 | B: 0.64559 | C: 0.64041
[LOGITS Ex2 A] Mean Abs: 1.909 | Max: 6.737
[LOSS Ex2] A: 0.14201 | B: 0.35409 | C: 0.29118
** [JOINT LOSS] ** : 0.909077
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002930 | Grad Max: 0.083788
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134515 | Grad Max: 0.609364
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005554
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003716 | Grad Max: 0.003716
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000921 | Grad Max: 0.159714
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015876 | Grad Max: 0.823228
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.006371
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007089 | Grad Max: 0.045934
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000231
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001389 | Grad Max: 0.003869
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000371 | Grad Max: 0.001113
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001205
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008225 | Grad Max: 0.008225
[GRADIENT NORM TOTAL] 2.7973

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.516
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.53990114 0.46009883] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.545 | Std: 0.056
[MASKS] A(Pass/Fail): 652/1396 | B: 592/1456 | C: 395/1653
[LOSS Ex1] A: 0.65380 | B: 0.64465 | C: 0.63627
[LOGITS Ex2 A] Mean Abs: 1.926 | Max: 6.484
[LOSS Ex2] A: 0.15404 | B: 0.37569 | C: 0.28011
** [JOINT LOSS] ** : 0.914856
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004926 | Grad Max: 0.136862
  -> Layer: shared_layers.0.bias | Grad Mean: 0.258351 | Grad Max: 1.206683
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.007129
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011910 | Grad Max: 0.011910
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001720 | Grad Max: 0.231437
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031807 | Grad Max: 1.284966
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000260 | Grad Max: 0.008544
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015002 | Grad Max: 0.078891
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000390
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003206 | Grad Max: 0.007101
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000848 | Grad Max: 0.001860
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000820 | Grad Max: 0.001774
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016701 | Grad Max: 0.016701
[GRADIENT NORM TOTAL] 5.2784

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.662
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7040736  0.29592642] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.059
[MASKS] A(Pass/Fail): 703/1345 | B: 594/1454 | C: 398/1650
[LOSS Ex1] A: 0.64868 | B: 0.64544 | C: 0.63728
[LOGITS Ex2 A] Mean Abs: 1.974 | Max: 6.504
[LOSS Ex2] A: 0.14851 | B: 0.37791 | C: 0.25443
** [JOINT LOSS] ** : 0.904079
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004681 | Grad Max: 0.124899
  -> Layer: shared_layers.0.bias | Grad Mean: 0.330295 | Grad Max: 1.586409
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006208
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004013 | Grad Max: 0.004013
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.292576
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039293 | Grad Max: 1.636535
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.012158
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019593 | Grad Max: 0.110384
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000501
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004196 | Grad Max: 0.009201
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001146 | Grad Max: 0.002798
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001119 | Grad Max: 0.002717
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023706 | Grad Max: 0.023706
[GRADIENT NORM TOTAL] 6.8266

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.739
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50065154 0.49934843] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 696/1352 | B: 595/1453 | C: 356/1692
[LOSS Ex1] A: 0.65448 | B: 0.64138 | C: 0.63899
[LOGITS Ex2 A] Mean Abs: 1.954 | Max: 6.826
[LOSS Ex2] A: 0.14369 | B: 0.35858 | C: 0.25730
** [JOINT LOSS] ** : 0.898141
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003970 | Grad Max: 0.182386
  -> Layer: shared_layers.0.bias | Grad Mean: 0.120905 | Grad Max: 0.456346
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.005667
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000603 | Grad Max: 0.000603
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000923 | Grad Max: 0.171880
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015292 | Grad Max: 0.978465
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.004983
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005605 | Grad Max: 0.036028
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000270
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001345 | Grad Max: 0.003922
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000108
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000356 | Grad Max: 0.001253
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000421 | Grad Max: 0.001480
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006942 | Grad Max: 0.006942
[GRADIENT NORM TOTAL] 2.6815

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.502
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6638131  0.33618686] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.058
[MASKS] A(Pass/Fail): 677/1371 | B: 554/1302 | C: 383/1665
[LOSS Ex1] A: 0.65099 | B: 0.64540 | C: 0.63851
[LOGITS Ex2 A] Mean Abs: 1.931 | Max: 5.797
[LOSS Ex2] A: 0.15783 | B: 0.36422 | C: 0.26529
** [JOINT LOSS] ** : 0.907412
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005356 | Grad Max: 0.194325
  -> Layer: shared_layers.0.bias | Grad Mean: 0.259038 | Grad Max: 0.958494
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.006303
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002764 | Grad Max: 0.002764
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001707 | Grad Max: 0.183730
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030736 | Grad Max: 1.001492
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000267 | Grad Max: 0.009383
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015314 | Grad Max: 0.077703
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000429
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003385 | Grad Max: 0.008195
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000188
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000909 | Grad Max: 0.002174
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000848 | Grad Max: 0.002154
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017606 | Grad Max: 0.017606
[GRADIENT NORM TOTAL] 4.8950

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.585
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.602245 0.397755] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.059
[MASKS] A(Pass/Fail): 570/1046 | B: 596/1452 | C: 357/1691
[LOSS Ex1] A: 0.64941 | B: 0.64447 | C: 0.64184
[LOGITS Ex2 A] Mean Abs: 1.983 | Max: 5.726
[LOSS Ex2] A: 0.14821 | B: 0.36864 | C: 0.27467
** [JOINT LOSS] ** : 0.909075
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.065982
  -> Layer: shared_layers.0.bias | Grad Mean: 0.137886 | Grad Max: 0.579281
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006389
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002317 | Grad Max: 0.002317
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000924 | Grad Max: 0.174097
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016586 | Grad Max: 0.966999
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000130 | Grad Max: 0.005308
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007459 | Grad Max: 0.043230
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000257
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001525 | Grad Max: 0.004498
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000104
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000405 | Grad Max: 0.001089
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000388 | Grad Max: 0.001473
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008379 | Grad Max: 0.008379
[GRADIENT NORM TOTAL] 3.0259

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.741
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072345 0.4927655] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.059
[MASKS] A(Pass/Fail): 688/1360 | B: 596/1452 | C: 274/1102
[LOSS Ex1] A: 0.64989 | B: 0.64525 | C: 0.63692
[LOGITS Ex2 A] Mean Abs: 1.956 | Max: 7.626
[LOSS Ex2] A: 0.14213 | B: 0.37338 | C: 0.28134
** [JOINT LOSS] ** : 0.909636
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003248 | Grad Max: 0.127789
  -> Layer: shared_layers.0.bias | Grad Mean: 0.136944 | Grad Max: 0.504707
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.005965
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001403 | Grad Max: 0.001403
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001018 | Grad Max: 0.168720
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017558 | Grad Max: 0.928505
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.005404
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006590 | Grad Max: 0.040196
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000274
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001530 | Grad Max: 0.004586
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000111
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000405 | Grad Max: 0.001054
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001484
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007652 | Grad Max: 0.007652
[GRADIENT NORM TOTAL] 3.1810

[EPOCH SUMMARY] Train Loss: 0.9078

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8880 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8902 -> New: 0.8880)

############################## EPOCH 102/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.692
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51075923 0.48924074] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.059
[MASKS] A(Pass/Fail): 685/1363 | B: 598/1450 | C: 367/1681
[LOSS Ex1] A: 0.64729 | B: 0.64119 | C: 0.64147
[LOGITS Ex2 A] Mean Abs: 1.932 | Max: 6.682
[LOSS Ex2] A: 0.14265 | B: 0.36212 | C: 0.26600
** [JOINT LOSS] ** : 0.900240
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.057704
  -> Layer: shared_layers.0.bias | Grad Mean: 0.150876 | Grad Max: 0.621492
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002419 | Grad Max: 0.007047
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012567 | Grad Max: 0.012567
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000992 | Grad Max: 0.214372
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017511 | Grad Max: 1.188504
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000133 | Grad Max: 0.005275
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007751 | Grad Max: 0.043650
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000254
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001573 | Grad Max: 0.004595
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000413 | Grad Max: 0.001206
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000387 | Grad Max: 0.001313
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008202 | Grad Max: 0.008202
[GRADIENT NORM TOTAL] 3.1507

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.715
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029694  0.49703065] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 681/1367 | B: 557/1299 | C: 389/1659
[LOSS Ex1] A: 0.64629 | B: 0.64521 | C: 0.63635
[LOGITS Ex2 A] Mean Abs: 1.923 | Max: 6.244
[LOSS Ex2] A: 0.15846 | B: 0.35505 | C: 0.26837
** [JOINT LOSS] ** : 0.903240
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004401 | Grad Max: 0.211652
  -> Layer: shared_layers.0.bias | Grad Mean: 0.150731 | Grad Max: 0.560755
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006685
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003456 | Grad Max: 0.003456
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001185 | Grad Max: 0.151749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019503 | Grad Max: 0.838555
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.005385
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007116 | Grad Max: 0.046132
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000201
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001236 | Grad Max: 0.004012
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000316 | Grad Max: 0.000972
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000329 | Grad Max: 0.001171
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006797 | Grad Max: 0.006797
[GRADIENT NORM TOTAL] 3.1374

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.591
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503421   0.49657896] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.546 | Std: 0.058
[MASKS] A(Pass/Fail): 656/1392 | B: 597/1451 | C: 389/1659
[LOSS Ex1] A: 0.65368 | B: 0.64427 | C: 0.63928
[LOGITS Ex2 A] Mean Abs: 1.942 | Max: 6.737
[LOSS Ex2] A: 0.14375 | B: 0.36669 | C: 0.27994
** [JOINT LOSS] ** : 0.909200
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003472 | Grad Max: 0.092154
  -> Layer: shared_layers.0.bias | Grad Mean: 0.217693 | Grad Max: 1.070309
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.006082
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005410 | Grad Max: 0.005411
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.165155
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025495 | Grad Max: 0.914001
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.010089
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012501 | Grad Max: 0.083132
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000319
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002642 | Grad Max: 0.006034
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000158
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000703 | Grad Max: 0.001707
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000674 | Grad Max: 0.001752
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013790 | Grad Max: 0.013790
[GRADIENT NORM TOTAL] 4.3515

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.521
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5401172 0.4598828] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.056
[MASKS] A(Pass/Fail): 654/1394 | B: 596/1452 | C: 352/1696
[LOSS Ex1] A: 0.65354 | B: 0.64506 | C: 0.64049
[LOGITS Ex2 A] Mean Abs: 1.907 | Max: 7.297
[LOSS Ex2] A: 0.15080 | B: 0.36545 | C: 0.26290
** [JOINT LOSS] ** : 0.906076
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002323 | Grad Max: 0.052785
  -> Layer: shared_layers.0.bias | Grad Mean: 0.118251 | Grad Max: 0.627693
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.006291
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007738 | Grad Max: 0.007738
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000972 | Grad Max: 0.315568
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017421 | Grad Max: 1.763107
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.005061
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005610 | Grad Max: 0.038316
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001093 | Grad Max: 0.003177
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000082
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000290 | Grad Max: 0.000908
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.001290
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005694 | Grad Max: 0.005694
[GRADIENT NORM TOTAL] 3.6159

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.667
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7057205  0.29427952] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.059
[MASKS] A(Pass/Fail): 705/1343 | B: 599/1449 | C: 393/1655
[LOSS Ex1] A: 0.64838 | B: 0.64098 | C: 0.64035
[LOGITS Ex2 A] Mean Abs: 1.947 | Max: 5.915
[LOSS Ex2] A: 0.14079 | B: 0.35017 | C: 0.29085
** [JOINT LOSS] ** : 0.903841
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004685 | Grad Max: 0.130553
  -> Layer: shared_layers.0.bias | Grad Mean: 0.197897 | Grad Max: 0.966388
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.006042
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001527 | Grad Max: 0.001527
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001245 | Grad Max: 0.389328
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022473 | Grad Max: 2.174060
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000165 | Grad Max: 0.004661
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009534 | Grad Max: 0.042430
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000294
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002164 | Grad Max: 0.005115
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000123
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000609 | Grad Max: 0.001509
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000685 | Grad Max: 0.001629
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013044 | Grad Max: 0.013044
[GRADIENT NORM TOTAL] 4.6607

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.745
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50069124 0.4993088 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.058
[MASKS] A(Pass/Fail): 696/1352 | B: 558/1298 | C: 406/1642
[LOSS Ex1] A: 0.65421 | B: 0.64502 | C: 0.63765
[LOGITS Ex2 A] Mean Abs: 1.946 | Max: 5.481
[LOSS Ex2] A: 0.13448 | B: 0.34876 | C: 0.27595
** [JOINT LOSS] ** : 0.898690
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005944 | Grad Max: 0.213114
  -> Layer: shared_layers.0.bias | Grad Mean: 0.145051 | Grad Max: 0.563141
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.005859
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005584 | Grad Max: 0.005584
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001103 | Grad Max: 0.184625
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018790 | Grad Max: 1.040065
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.005950
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008269 | Grad Max: 0.041985
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000302
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001968 | Grad Max: 0.005086
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000121
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000537 | Grad Max: 0.001340
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000641 | Grad Max: 0.001684
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011161 | Grad Max: 0.011161
[GRADIENT NORM TOTAL] 3.0996

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.506
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6649985  0.33500153] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.059
[MASKS] A(Pass/Fail): 679/1369 | B: 599/1449 | C: 370/1678
[LOSS Ex1] A: 0.65070 | B: 0.64408 | C: 0.63885
[LOGITS Ex2 A] Mean Abs: 1.978 | Max: 6.287
[LOSS Ex2] A: 0.16039 | B: 0.36822 | C: 0.27675
** [JOINT LOSS] ** : 0.912999
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002830 | Grad Max: 0.118146
  -> Layer: shared_layers.0.bias | Grad Mean: 0.262147 | Grad Max: 1.452426
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005812
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002593 | Grad Max: 0.002593
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001681 | Grad Max: 0.225177
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030313 | Grad Max: 1.259806
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.009159
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014337 | Grad Max: 0.077252
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000370
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002948 | Grad Max: 0.006972
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000159
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000792 | Grad Max: 0.001782
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000736 | Grad Max: 0.001864
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015946 | Grad Max: 0.015946
[GRADIENT NORM TOTAL] 5.6206

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.591
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6028875 0.3971125] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.059
[MASKS] A(Pass/Fail): 571/1045 | B: 596/1452 | C: 386/1662
[LOSS Ex1] A: 0.64912 | B: 0.64486 | C: 0.63745
[LOGITS Ex2 A] Mean Abs: 2.026 | Max: 5.637
[LOSS Ex2] A: 0.14448 | B: 0.36928 | C: 0.29259
** [JOINT LOSS] ** : 0.912592
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003281 | Grad Max: 0.111977
  -> Layer: shared_layers.0.bias | Grad Mean: 0.259810 | Grad Max: 1.460370
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.007363
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011635 | Grad Max: 0.011635
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001676 | Grad Max: 0.238693
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030947 | Grad Max: 1.329581
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.009488
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014061 | Grad Max: 0.076562
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000370
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003004 | Grad Max: 0.007365
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000180
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000792 | Grad Max: 0.001974
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000688 | Grad Max: 0.001690
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015019 | Grad Max: 0.015019
[GRADIENT NORM TOTAL] 5.6018

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.747
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50719315 0.4928069 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 688/1360 | B: 599/1449 | C: 387/1661
[LOSS Ex1] A: 0.64960 | B: 0.64078 | C: 0.63962
[LOGITS Ex2 A] Mean Abs: 1.977 | Max: 6.453
[LOSS Ex2] A: 0.13588 | B: 0.35103 | C: 0.27466
** [JOINT LOSS] ** : 0.897193
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.076278
  -> Layer: shared_layers.0.bias | Grad Mean: 0.058168 | Grad Max: 0.365573
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.006440
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008998 | Grad Max: 0.008998
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000540 | Grad Max: 0.102542
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008568 | Grad Max: 0.570350
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003281
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001913 | Grad Max: 0.016460
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000145
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000292 | Grad Max: 0.001889
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000049
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000080 | Grad Max: 0.000404
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.000852
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001007 | Grad Max: 0.001008
[GRADIENT NORM TOTAL] 1.6182

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.699
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107295 0.4892705] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.059
[MASKS] A(Pass/Fail): 685/1363 | B: 562/1294 | C: 389/1659
[LOSS Ex1] A: 0.64699 | B: 0.64482 | C: 0.63766
[LOGITS Ex2 A] Mean Abs: 1.968 | Max: 6.306
[LOSS Ex2] A: 0.15022 | B: 0.35787 | C: 0.28439
** [JOINT LOSS] ** : 0.907317
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002475 | Grad Max: 0.092894
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289832 | Grad Max: 1.308128
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002357 | Grad Max: 0.006933
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010784 | Grad Max: 0.010784
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001835 | Grad Max: 0.270818
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033500 | Grad Max: 1.533499
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.008661
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016221 | Grad Max: 0.078715
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000362
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003385 | Grad Max: 0.007425
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000192
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000912 | Grad Max: 0.002377
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000884 | Grad Max: 0.001740
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018793 | Grad Max: 0.018793
[GRADIENT NORM TOTAL] 6.0356

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.721
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029926 0.4970075] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.059
[MASKS] A(Pass/Fail): 681/1367 | B: 601/1447 | C: 381/1667
[LOSS Ex1] A: 0.64599 | B: 0.64387 | C: 0.64104
[LOGITS Ex2 A] Mean Abs: 1.946 | Max: 6.509
[LOSS Ex2] A: 0.16317 | B: 0.37254 | C: 0.26890
** [JOINT LOSS] ** : 0.911834
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004198 | Grad Max: 0.159009
  -> Layer: shared_layers.0.bias | Grad Mean: 0.166618 | Grad Max: 0.805899
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006132
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002876 | Grad Max: 0.002876
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001223 | Grad Max: 0.179237
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020725 | Grad Max: 0.983743
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.006416
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007177 | Grad Max: 0.047912
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000197
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001282 | Grad Max: 0.003457
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000347 | Grad Max: 0.000983
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000337 | Grad Max: 0.001212
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007611 | Grad Max: 0.007611
[GRADIENT NORM TOTAL] 3.7205

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.596
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5033652  0.49663472] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.058
[MASKS] A(Pass/Fail): 656/1392 | B: 596/1452 | C: 397/1651
[LOSS Ex1] A: 0.65341 | B: 0.64464 | C: 0.63768
[LOGITS Ex2 A] Mean Abs: 1.973 | Max: 6.014
[LOSS Ex2] A: 0.14457 | B: 0.36320 | C: 0.26705
** [JOINT LOSS] ** : 0.903513
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004667 | Grad Max: 0.165724
  -> Layer: shared_layers.0.bias | Grad Mean: 0.311302 | Grad Max: 1.239429
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.006191
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009281 | Grad Max: 0.009281
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.258932
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037417 | Grad Max: 1.352906
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000301 | Grad Max: 0.010277
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017693 | Grad Max: 0.097316
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000413
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003746 | Grad Max: 0.007739
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000205
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001007 | Grad Max: 0.002382
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001013 | Grad Max: 0.002500
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020760 | Grad Max: 0.020760
[GRADIENT NORM TOTAL] 6.3133

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.526
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5402258  0.45977423] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.057
[MASKS] A(Pass/Fail): 655/1393 | B: 602/1446 | C: 394/1654
[LOSS Ex1] A: 0.65328 | B: 0.64054 | C: 0.63582
[LOGITS Ex2 A] Mean Abs: 1.953 | Max: 6.100
[LOSS Ex2] A: 0.15073 | B: 0.34671 | C: 0.26770
** [JOINT LOSS] ** : 0.898262
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005095 | Grad Max: 0.155621
  -> Layer: shared_layers.0.bias | Grad Mean: 0.302388 | Grad Max: 1.264640
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.006490
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008039 | Grad Max: 0.008039
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001891 | Grad Max: 0.279816
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034770 | Grad Max: 1.565925
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000278 | Grad Max: 0.010086
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016451 | Grad Max: 0.097131
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000447
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003492 | Grad Max: 0.008180
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000194
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000948 | Grad Max: 0.002222
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000978 | Grad Max: 0.002398
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020091 | Grad Max: 0.020091
[GRADIENT NORM TOTAL] 6.0364

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.672
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70726484 0.2927352 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.059
[MASKS] A(Pass/Fail): 705/1343 | B: 565/1291 | C: 266/1110
[LOSS Ex1] A: 0.64810 | B: 0.64459 | C: 0.63629
[LOGITS Ex2 A] Mean Abs: 1.970 | Max: 6.173
[LOSS Ex2] A: 0.14260 | B: 0.35212 | C: 0.27431
** [JOINT LOSS] ** : 0.899335
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002973 | Grad Max: 0.112155
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132821 | Grad Max: 0.566331
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.006189
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004689 | Grad Max: 0.004689
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001041 | Grad Max: 0.358536
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018461 | Grad Max: 2.006120
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.006851
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007170 | Grad Max: 0.054539
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000266
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001626 | Grad Max: 0.004293
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000449 | Grad Max: 0.001206
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000439 | Grad Max: 0.001245
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009509 | Grad Max: 0.009509
[GRADIENT NORM TOTAL] 3.6678

[EPOCH SUMMARY] Train Loss: 0.9046

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8896 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 103/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.750
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50072    0.49927992] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.059
[MASKS] A(Pass/Fail): 696/1352 | B: 601/1447 | C: 390/1658
[LOSS Ex1] A: 0.65397 | B: 0.64364 | C: 0.63669
[LOGITS Ex2 A] Mean Abs: 1.978 | Max: 5.963
[LOSS Ex2] A: 0.14136 | B: 0.37331 | C: 0.26805
** [JOINT LOSS] ** : 0.905669
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005188 | Grad Max: 0.219298
  -> Layer: shared_layers.0.bias | Grad Mean: 0.229160 | Grad Max: 1.057505
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.005716
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001392 | Grad Max: 0.001392
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001711 | Grad Max: 0.395138
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029703 | Grad Max: 2.218769
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.007632
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013732 | Grad Max: 0.063056
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003161 | Grad Max: 0.006686
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000178
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000866 | Grad Max: 0.001895
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000930 | Grad Max: 0.001767
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018332 | Grad Max: 0.018332
[GRADIENT NORM TOTAL] 5.2681

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.511
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6662289  0.33377114] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.059
[MASKS] A(Pass/Fail): 680/1368 | B: 597/1451 | C: 405/1643
[LOSS Ex1] A: 0.65044 | B: 0.64442 | C: 0.63570
[LOGITS Ex2 A] Mean Abs: 1.988 | Max: 6.119
[LOSS Ex2] A: 0.15996 | B: 0.36915 | C: 0.27335
** [JOINT LOSS] ** : 0.911005
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.138877
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134711 | Grad Max: 0.716225
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.006167
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005090 | Grad Max: 0.005090
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000950 | Grad Max: 0.153516
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015125 | Grad Max: 0.853783
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000069 | Grad Max: 0.003578
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002549 | Grad Max: 0.025396
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000156
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000334 | Grad Max: 0.002053
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000067
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000532
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000182 | Grad Max: 0.000769
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000786 | Grad Max: 0.000786
[GRADIENT NORM TOTAL] 2.9057

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.596
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6035885  0.39641148] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.060
[MASKS] A(Pass/Fail): 572/1044 | B: 601/1447 | C: 394/1654
[LOSS Ex1] A: 0.64885 | B: 0.64033 | C: 0.64003
[LOGITS Ex2 A] Mean Abs: 2.039 | Max: 6.038
[LOSS Ex2] A: 0.14186 | B: 0.35085 | C: 0.27799
** [JOINT LOSS] ** : 0.899971
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002835 | Grad Max: 0.123176
  -> Layer: shared_layers.0.bias | Grad Mean: 0.277488 | Grad Max: 1.442962
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002196 | Grad Max: 0.006205
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002332 | Grad Max: 0.002332
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001847 | Grad Max: 0.257997
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034112 | Grad Max: 1.437707
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.008907
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016353 | Grad Max: 0.076895
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003430 | Grad Max: 0.007625
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000182
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000914 | Grad Max: 0.002052
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000823 | Grad Max: 0.002196
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017896 | Grad Max: 0.017896
[GRADIENT NORM TOTAL] 5.9448

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.752
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071637  0.49283633] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 688/1360 | B: 567/1289 | C: 393/1655
[LOSS Ex1] A: 0.64933 | B: 0.64438 | C: 0.63894
[LOGITS Ex2 A] Mean Abs: 1.983 | Max: 6.919
[LOSS Ex2] A: 0.15272 | B: 0.34066 | C: 0.27086
** [JOINT LOSS] ** : 0.898962
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003426 | Grad Max: 0.114550
  -> Layer: shared_layers.0.bias | Grad Mean: 0.079115 | Grad Max: 0.366179
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005753
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000625 | Grad Max: 0.000625
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000743 | Grad Max: 0.156935
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011933 | Grad Max: 0.864668
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.004067
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002268 | Grad Max: 0.031933
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000176
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000307 | Grad Max: 0.001756
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000053
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000080 | Grad Max: 0.000504
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.000783
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000405 | Grad Max: 0.000405
[GRADIENT NORM TOTAL] 2.3473

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.704
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108048  0.48919517] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 686/1362 | B: 601/1447 | C: 359/1689
[LOSS Ex1] A: 0.64670 | B: 0.64344 | C: 0.64020
[LOGITS Ex2 A] Mean Abs: 1.982 | Max: 6.067
[LOSS Ex2] A: 0.14536 | B: 0.37028 | C: 0.28133
** [JOINT LOSS] ** : 0.909107
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002923 | Grad Max: 0.068328
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152453 | Grad Max: 0.895642
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002331 | Grad Max: 0.006859
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009981 | Grad Max: 0.009981
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001048 | Grad Max: 0.251468
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018348 | Grad Max: 1.420308
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000094 | Grad Max: 0.005849
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005035 | Grad Max: 0.044751
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000181
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000965 | Grad Max: 0.002972
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000258 | Grad Max: 0.000780
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000260 | Grad Max: 0.000965
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005330 | Grad Max: 0.005330
[GRADIENT NORM TOTAL] 3.8771

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.727
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029448  0.49705514] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.059
[MASKS] A(Pass/Fail): 682/1366 | B: 597/1451 | C: 393/1655
[LOSS Ex1] A: 0.64569 | B: 0.64422 | C: 0.63854
[LOGITS Ex2 A] Mean Abs: 1.960 | Max: 6.686
[LOSS Ex2] A: 0.16570 | B: 0.36607 | C: 0.26022
** [JOINT LOSS] ** : 0.906814
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003706 | Grad Max: 0.173327
  -> Layer: shared_layers.0.bias | Grad Mean: 0.056491 | Grad Max: 0.200452
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002215 | Grad Max: 0.006378
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001037 | Grad Max: 0.001037
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000658 | Grad Max: 0.132926
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009710 | Grad Max: 0.668221
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.004304
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002088 | Grad Max: 0.024815
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000172
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000301 | Grad Max: 0.001671
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000052
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000078 | Grad Max: 0.000488
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000274 | Grad Max: 0.000718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000416 | Grad Max: 0.000416
[GRADIENT NORM TOTAL] 1.7678

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.600
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50325227 0.49674776] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.059
[MASKS] A(Pass/Fail): 658/1390 | B: 601/1447 | C: 378/1670
[LOSS Ex1] A: 0.65316 | B: 0.64011 | C: 0.63757
[LOGITS Ex2 A] Mean Abs: 1.939 | Max: 7.053
[LOSS Ex2] A: 0.14079 | B: 0.33969 | C: 0.28114
** [JOINT LOSS] ** : 0.897484
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.069403
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152091 | Grad Max: 0.864043
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006006
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004968 | Grad Max: 0.004968
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000971 | Grad Max: 0.210445
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017594 | Grad Max: 1.179626
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.005105
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006911 | Grad Max: 0.043056
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000226
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001459 | Grad Max: 0.003874
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000391 | Grad Max: 0.001110
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000416 | Grad Max: 0.001528
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007625 | Grad Max: 0.007625
[GRADIENT NORM TOTAL] 3.4867

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.531
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54053634 0.45946366] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.546 | Std: 0.057
[MASKS] A(Pass/Fail): 655/1393 | B: 567/1289 | C: 421/1627
[LOSS Ex1] A: 0.65303 | B: 0.64416 | C: 0.63811
[LOGITS Ex2 A] Mean Abs: 1.939 | Max: 6.828
[LOSS Ex2] A: 0.15037 | B: 0.34318 | C: 0.27800
** [JOINT LOSS] ** : 0.902278
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005490 | Grad Max: 0.235252
  -> Layer: shared_layers.0.bias | Grad Mean: 0.092015 | Grad Max: 0.540924
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005360
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003172 | Grad Max: 0.003172
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000888 | Grad Max: 0.108699
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014374 | Grad Max: 0.551466
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.004696
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004320 | Grad Max: 0.031101
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000259
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001052 | Grad Max: 0.003524
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000096
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000271 | Grad Max: 0.000854
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000352 | Grad Max: 0.001104
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005302 | Grad Max: 0.005302
[GRADIENT NORM TOTAL] 2.3113

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.678
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7090002  0.29099977] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.060
[MASKS] A(Pass/Fail): 706/1342 | B: 601/1447 | C: 389/1659
[LOSS Ex1] A: 0.64780 | B: 0.64321 | C: 0.63832
[LOGITS Ex2 A] Mean Abs: 1.972 | Max: 6.666
[LOSS Ex2] A: 0.14128 | B: 0.37028 | C: 0.29045
** [JOINT LOSS] ** : 0.910443
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002924 | Grad Max: 0.089796
  -> Layer: shared_layers.0.bias | Grad Mean: 0.055570 | Grad Max: 0.349306
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002321 | Grad Max: 0.006718
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010544 | Grad Max: 0.010544
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000686 | Grad Max: 0.193945
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011150 | Grad Max: 1.097914
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.004884
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002403 | Grad Max: 0.031624
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000145
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000345 | Grad Max: 0.002020
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000064
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000429
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000204 | Grad Max: 0.000661
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001009 | Grad Max: 0.001009
[GRADIENT NORM TOTAL] 2.1235

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.756
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008049 0.4991951] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.059
[MASKS] A(Pass/Fail): 696/1352 | B: 597/1451 | C: 411/1637
[LOSS Ex1] A: 0.65370 | B: 0.64397 | C: 0.63513
[LOGITS Ex2 A] Mean Abs: 1.992 | Max: 5.746
[LOSS Ex2] A: 0.13689 | B: 0.36674 | C: 0.28135
** [JOINT LOSS] ** : 0.905926
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002462 | Grad Max: 0.069731
  -> Layer: shared_layers.0.bias | Grad Mean: 0.128123 | Grad Max: 0.485318
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005973
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004295 | Grad Max: 0.004295
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000752 | Grad Max: 0.396218
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012876 | Grad Max: 2.218232
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.002739
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002092 | Grad Max: 0.019925
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000130
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000490 | Grad Max: 0.002518
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000156 | Grad Max: 0.000587
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000315 | Grad Max: 0.001142
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004663 | Grad Max: 0.004663
[GRADIENT NORM TOTAL] 3.7820

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.516
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.667576   0.33242396] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.060
[MASKS] A(Pass/Fail): 680/1368 | B: 601/1447 | C: 414/1634
[LOSS Ex1] A: 0.65013 | B: 0.63985 | C: 0.63603
[LOGITS Ex2 A] Mean Abs: 1.976 | Max: 5.900
[LOSS Ex2] A: 0.15768 | B: 0.35488 | C: 0.26650
** [JOINT LOSS] ** : 0.901693
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.069968
  -> Layer: shared_layers.0.bias | Grad Mean: 0.080019 | Grad Max: 0.330233
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002259 | Grad Max: 0.005960
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003209 | Grad Max: 0.003209
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000623 | Grad Max: 0.136639
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010687 | Grad Max: 0.751850
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.005665
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003851 | Grad Max: 0.031288
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000192
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000774 | Grad Max: 0.002874
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000082
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000208 | Grad Max: 0.000858
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000282 | Grad Max: 0.001092
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004069 | Grad Max: 0.004069
[GRADIENT NORM TOTAL] 2.0871

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.602
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6043871  0.39561287] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.060
[MASKS] A(Pass/Fail): 572/1044 | B: 567/1289 | C: 429/1619
[LOSS Ex1] A: 0.64853 | B: 0.64390 | C: 0.63693
[LOGITS Ex2 A] Mean Abs: 2.009 | Max: 6.914
[LOSS Ex2] A: 0.13935 | B: 0.35484 | C: 0.26107
** [JOINT LOSS] ** : 0.894871
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002907 | Grad Max: 0.070416
  -> Layer: shared_layers.0.bias | Grad Mean: 0.199217 | Grad Max: 0.857371
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.006989
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010613 | Grad Max: 0.010613
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001344 | Grad Max: 0.207113
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024207 | Grad Max: 1.152167
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000207 | Grad Max: 0.008929
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012354 | Grad Max: 0.075131
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000356
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002506 | Grad Max: 0.006893
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000144
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000670 | Grad Max: 0.001659
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000597 | Grad Max: 0.001726
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013272 | Grad Max: 0.013272
[GRADIENT NORM TOTAL] 4.2563

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.758
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50708926 0.49291074] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 688/1360 | B: 601/1447 | C: 403/1645
[LOSS Ex1] A: 0.64901 | B: 0.64295 | C: 0.63949
[LOGITS Ex2 A] Mean Abs: 1.994 | Max: 6.992
[LOSS Ex2] A: 0.14263 | B: 0.36589 | C: 0.25640
** [JOINT LOSS] ** : 0.898787
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005063 | Grad Max: 0.202942
  -> Layer: shared_layers.0.bias | Grad Mean: 0.091441 | Grad Max: 0.534830
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.005812
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000213 | Grad Max: 0.000213
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000849 | Grad Max: 0.130432
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013742 | Grad Max: 0.692998
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.005572
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004234 | Grad Max: 0.025086
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001021 | Grad Max: 0.003928
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000288 | Grad Max: 0.000896
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000393 | Grad Max: 0.001421
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006169 | Grad Max: 0.006169
[GRADIENT NORM TOTAL] 2.2044

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.710
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108839  0.48911604] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 687/1361 | B: 598/1450 | C: 286/1090
[LOSS Ex1] A: 0.64634 | B: 0.64372 | C: 0.63967
[LOGITS Ex2 A] Mean Abs: 1.962 | Max: 6.217
[LOSS Ex2] A: 0.14570 | B: 0.36469 | C: 0.27514
** [JOINT LOSS] ** : 0.905085
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002823 | Grad Max: 0.066891
  -> Layer: shared_layers.0.bias | Grad Mean: 0.057118 | Grad Max: 0.313494
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002285 | Grad Max: 0.006415
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007699 | Grad Max: 0.007699
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000655 | Grad Max: 0.164286
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010558 | Grad Max: 0.930329
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.004057
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001955 | Grad Max: 0.026105
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000169
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000297 | Grad Max: 0.001982
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000060
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000078 | Grad Max: 0.000514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000270 | Grad Max: 0.000914
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000512 | Grad Max: 0.000512
[GRADIENT NORM TOTAL] 1.9672

[EPOCH SUMMARY] Train Loss: 0.9034

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8840 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8880 -> New: 0.8840)

############################## EPOCH 104/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.734
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.502902 0.497098] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 682/1366 | B: 602/1446 | C: 405/1643
[LOSS Ex1] A: 0.64533 | B: 0.63959 | C: 0.63750
[LOGITS Ex2 A] Mean Abs: 1.953 | Max: 7.330
[LOSS Ex2] A: 0.15402 | B: 0.35331 | C: 0.27128
** [JOINT LOSS] ** : 0.900344
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003099 | Grad Max: 0.087777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.092034 | Grad Max: 0.359698
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.006389
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000040 | Grad Max: 0.000040
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000675 | Grad Max: 0.095445
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012206 | Grad Max: 0.535767
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000080 | Grad Max: 0.004242
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004247 | Grad Max: 0.031215
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000159
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000880 | Grad Max: 0.003413
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000238 | Grad Max: 0.000825
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000309 | Grad Max: 0.001109
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004860 | Grad Max: 0.004860
[GRADIENT NORM TOTAL] 2.0599

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.606
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5031084  0.49689165] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.547 | Std: 0.059
[MASKS] A(Pass/Fail): 658/1390 | B: 567/1289 | C: 390/1658
[LOSS Ex1] A: 0.65284 | B: 0.64363 | C: 0.64317
[LOGITS Ex2 A] Mean Abs: 1.947 | Max: 5.971
[LOSS Ex2] A: 0.14324 | B: 0.34613 | C: 0.27993
** [JOINT LOSS] ** : 0.902981
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003929 | Grad Max: 0.161977
  -> Layer: shared_layers.0.bias | Grad Mean: 0.146415 | Grad Max: 0.910827
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.005469
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004495 | Grad Max: 0.004495
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001140 | Grad Max: 0.184266
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019408 | Grad Max: 1.031261
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.006530
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005082 | Grad Max: 0.060516
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000195
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000879 | Grad Max: 0.003408
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000229 | Grad Max: 0.000872
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.000975
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004648 | Grad Max: 0.004648
[GRADIENT NORM TOTAL] 3.5023

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.537
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5408156 0.4591844] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.057
[MASKS] A(Pass/Fail): 657/1391 | B: 601/1447 | C: 406/1642
[LOSS Ex1] A: 0.65271 | B: 0.64267 | C: 0.63959
[LOGITS Ex2 A] Mean Abs: 1.951 | Max: 6.587
[LOSS Ex2] A: 0.15244 | B: 0.37201 | C: 0.28489
** [JOINT LOSS] ** : 0.914773
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004154 | Grad Max: 0.167241
  -> Layer: shared_layers.0.bias | Grad Mean: 0.173882 | Grad Max: 0.794187
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005944
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007523 | Grad Max: 0.007523
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001322 | Grad Max: 0.197952
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022387 | Grad Max: 1.102213
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.007074
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007755 | Grad Max: 0.054545
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000239
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001359 | Grad Max: 0.004852
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000337 | Grad Max: 0.001061
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000278 | Grad Max: 0.001029
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006293 | Grad Max: 0.006293
[GRADIENT NORM TOTAL] 3.9320

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.685
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71106994 0.2889301 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.061
[MASKS] A(Pass/Fail): 706/1342 | B: 601/1447 | C: 425/1623
[LOSS Ex1] A: 0.64743 | B: 0.64344 | C: 0.63393
[LOGITS Ex2 A] Mean Abs: 1.971 | Max: 6.002
[LOSS Ex2] A: 0.14182 | B: 0.37578 | C: 0.26150
** [JOINT LOSS] ** : 0.901300
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003642 | Grad Max: 0.104679
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191379 | Grad Max: 0.888431
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.006157
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003866 | Grad Max: 0.003866
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001263 | Grad Max: 0.395372
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022607 | Grad Max: 2.204231
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.006067
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009875 | Grad Max: 0.052292
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000305
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002214 | Grad Max: 0.005122
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000141
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000599 | Grad Max: 0.001378
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000661 | Grad Max: 0.001566
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012605 | Grad Max: 0.012605
[GRADIENT NORM TOTAL] 4.5130

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.763
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008894  0.49911052] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 696/1352 | B: 602/1446 | C: 404/1644
[LOSS Ex1] A: 0.65337 | B: 0.63931 | C: 0.63764
[LOGITS Ex2 A] Mean Abs: 1.970 | Max: 6.229
[LOSS Ex2] A: 0.13149 | B: 0.35117 | C: 0.25214
** [JOINT LOSS] ** : 0.888379
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003039 | Grad Max: 0.087151
  -> Layer: shared_layers.0.bias | Grad Mean: 0.120192 | Grad Max: 0.511599
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002245 | Grad Max: 0.005838
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007384 | Grad Max: 0.007384
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000747 | Grad Max: 0.376162
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012557 | Grad Max: 2.111840
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003745
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003044 | Grad Max: 0.025813
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000181
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000718 | Grad Max: 0.003045
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000186 | Grad Max: 0.000838
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000489 | Grad Max: 0.001330
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003225 | Grad Max: 0.003225
[GRADIENT NORM TOTAL] 3.5166

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.521
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.66910523 0.33089474] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 680/1368 | B: 567/1289 | C: 422/1626
[LOSS Ex1] A: 0.64978 | B: 0.64337 | C: 0.63635
[LOGITS Ex2 A] Mean Abs: 1.982 | Max: 5.633
[LOSS Ex2] A: 0.15079 | B: 0.34647 | C: 0.25988
** [JOINT LOSS] ** : 0.895542
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.053555
  -> Layer: shared_layers.0.bias | Grad Mean: 0.088992 | Grad Max: 0.366005
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005891
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000374 | Grad Max: 0.000374
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000794 | Grad Max: 0.133179
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013852 | Grad Max: 0.739968
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.004627
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006271 | Grad Max: 0.032488
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000257
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001386 | Grad Max: 0.004014
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000106
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000387 | Grad Max: 0.001123
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000466 | Grad Max: 0.001632
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009312 | Grad Max: 0.009312
[GRADIENT NORM TOTAL] 2.1900

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.608
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.605253   0.39474696] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 574/1042 | B: 601/1447 | C: 408/1640
[LOSS Ex1] A: 0.64817 | B: 0.64241 | C: 0.63716
[LOGITS Ex2 A] Mean Abs: 2.013 | Max: 5.872
[LOSS Ex2] A: 0.13761 | B: 0.37463 | C: 0.26603
** [JOINT LOSS] ** : 0.902007
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002982 | Grad Max: 0.071280
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201082 | Grad Max: 1.006298
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006224
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003989 | Grad Max: 0.003989
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001343 | Grad Max: 0.209530
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024124 | Grad Max: 1.171321
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000161 | Grad Max: 0.007359
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009625 | Grad Max: 0.058842
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001941 | Grad Max: 0.005090
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000128
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000524 | Grad Max: 0.001431
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000518 | Grad Max: 0.001409
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011007 | Grad Max: 0.011007
[GRADIENT NORM TOTAL] 4.4694

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.765
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50704044 0.49295953] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 689/1359 | B: 601/1447 | C: 427/1621
[LOSS Ex1] A: 0.64865 | B: 0.64318 | C: 0.63420
[LOGITS Ex2 A] Mean Abs: 1.995 | Max: 7.835
[LOSS Ex2] A: 0.13577 | B: 0.36923 | C: 0.28137
** [JOINT LOSS] ** : 0.904134
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002968 | Grad Max: 0.134296
  -> Layer: shared_layers.0.bias | Grad Mean: 0.075863 | Grad Max: 0.246819
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.005887
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002618 | Grad Max: 0.002618
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000645 | Grad Max: 0.214752
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010206 | Grad Max: 1.150349
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002713
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001882 | Grad Max: 0.016636
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000153
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000330 | Grad Max: 0.002075
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000064
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000444
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000232 | Grad Max: 0.000825
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001329 | Grad Max: 0.001329
[GRADIENT NORM TOTAL] 2.2321

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.718
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109122 0.4890878] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 688/1360 | B: 602/1446 | C: 393/1655
[LOSS Ex1] A: 0.64596 | B: 0.63905 | C: 0.64058
[LOGITS Ex2 A] Mean Abs: 2.006 | Max: 5.911
[LOSS Ex2] A: 0.13924 | B: 0.34992 | C: 0.27839
** [JOINT LOSS] ** : 0.897711
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003370 | Grad Max: 0.068272
  -> Layer: shared_layers.0.bias | Grad Mean: 0.168987 | Grad Max: 0.850505
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002297 | Grad Max: 0.006476
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006598 | Grad Max: 0.006598
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001215 | Grad Max: 0.215851
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021921 | Grad Max: 1.206809
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000159 | Grad Max: 0.006874
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009358 | Grad Max: 0.055348
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000293
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002034 | Grad Max: 0.005283
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000123
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000543 | Grad Max: 0.001404
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000559 | Grad Max: 0.001507
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011172 | Grad Max: 0.011172
[GRADIENT NORM TOTAL] 3.7387

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.741
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50292873 0.49707127] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.060
[MASKS] A(Pass/Fail): 684/1364 | B: 567/1289 | C: 422/1626
[LOSS Ex1] A: 0.64495 | B: 0.64311 | C: 0.63211
[LOGITS Ex2 A] Mean Abs: 1.963 | Max: 7.008
[LOSS Ex2] A: 0.15270 | B: 0.34867 | C: 0.24313
** [JOINT LOSS] ** : 0.888222
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003121 | Grad Max: 0.077247
  -> Layer: shared_layers.0.bias | Grad Mean: 0.155333 | Grad Max: 0.595657
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002313 | Grad Max: 0.006604
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001280 | Grad Max: 0.001280
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001087 | Grad Max: 0.099529
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019096 | Grad Max: 0.548450
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.008283
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009777 | Grad Max: 0.070187
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000300
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001948 | Grad Max: 0.005215
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000117
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000500 | Grad Max: 0.001349
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000438 | Grad Max: 0.001458
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008657 | Grad Max: 0.008657
[GRADIENT NORM TOTAL] 3.0962

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.613
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50301236 0.49698767] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.060
[MASKS] A(Pass/Fail): 660/1388 | B: 601/1447 | C: 434/1614
[LOSS Ex1] A: 0.65251 | B: 0.64215 | C: 0.63375
[LOGITS Ex2 A] Mean Abs: 1.960 | Max: 5.906
[LOSS Ex2] A: 0.14764 | B: 0.37423 | C: 0.28112
** [JOINT LOSS] ** : 0.910465
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001942 | Grad Max: 0.041045
  -> Layer: shared_layers.0.bias | Grad Mean: 0.082610 | Grad Max: 0.392511
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.006876
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010629 | Grad Max: 0.010629
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000720 | Grad Max: 0.152706
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012473 | Grad Max: 0.842636
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.004775
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002874 | Grad Max: 0.027660
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000170
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000482 | Grad Max: 0.002282
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000067
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000109 | Grad Max: 0.000631
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000191 | Grad Max: 0.000670
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000547 | Grad Max: 0.000547
[GRADIENT NORM TOTAL] 2.4893

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.543
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409642  0.45903584] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.058
[MASKS] A(Pass/Fail): 661/1387 | B: 601/1447 | C: 414/1634
[LOSS Ex1] A: 0.65239 | B: 0.64292 | C: 0.63764
[LOGITS Ex2 A] Mean Abs: 1.954 | Max: 5.865
[LOSS Ex2] A: 0.15213 | B: 0.36697 | C: 0.25630
** [JOINT LOSS] ** : 0.902780
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003750 | Grad Max: 0.147732
  -> Layer: shared_layers.0.bias | Grad Mean: 0.117029 | Grad Max: 0.477745
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.006725
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011117 | Grad Max: 0.011117
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000964 | Grad Max: 0.154731
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015965 | Grad Max: 0.854128
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.003823
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002392 | Grad Max: 0.031446
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000127
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000317 | Grad Max: 0.001819
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000523
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000282 | Grad Max: 0.000712
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000889 | Grad Max: 0.000889
[GRADIENT NORM TOTAL] 2.8621

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.691
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7130228  0.28697714] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.061
[MASKS] A(Pass/Fail): 708/1340 | B: 602/1446 | C: 416/1632
[LOSS Ex1] A: 0.64708 | B: 0.63878 | C: 0.63703
[LOGITS Ex2 A] Mean Abs: 1.993 | Max: 6.524
[LOSS Ex2] A: 0.13816 | B: 0.34663 | C: 0.26482
** [JOINT LOSS] ** : 0.890834
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.042926
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108781 | Grad Max: 0.437262
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002361 | Grad Max: 0.006639
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009480 | Grad Max: 0.009480
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000832 | Grad Max: 0.327847
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014933 | Grad Max: 1.831866
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.004138
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004692 | Grad Max: 0.034681
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000186
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000951 | Grad Max: 0.003162
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000253 | Grad Max: 0.000791
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000363 | Grad Max: 0.001087
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004415 | Grad Max: 0.004415
[GRADIENT NORM TOTAL] 3.4236

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.770
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009303  0.49906972] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 697/1351 | B: 567/1289 | C: 272/1104
[LOSS Ex1] A: 0.65304 | B: 0.64285 | C: 0.63800
[LOGITS Ex2 A] Mean Abs: 1.991 | Max: 5.957
[LOSS Ex2] A: 0.13570 | B: 0.36123 | C: 0.29365
** [JOINT LOSS] ** : 0.908159
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005506 | Grad Max: 0.169706
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282008 | Grad Max: 1.068803
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005618
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003732 | Grad Max: 0.003732
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.401939
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034872 | Grad Max: 2.245164
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.010675
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016433 | Grad Max: 0.083245
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000392
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003628 | Grad Max: 0.007835
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000218
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000966 | Grad Max: 0.002578
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000954 | Grad Max: 0.001936
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019408 | Grad Max: 0.019408
[GRADIENT NORM TOTAL] 6.1825

[EPOCH SUMMARY] Train Loss: 0.9005

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8813 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8840 -> New: 0.8813)

############################## EPOCH 105/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.527
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6705793  0.32942063] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.061
[MASKS] A(Pass/Fail): 681/1367 | B: 603/1445 | C: 406/1642
[LOSS Ex1] A: 0.64943 | B: 0.64191 | C: 0.63500
[LOGITS Ex2 A] Mean Abs: 1.991 | Max: 6.205
[LOSS Ex2] A: 0.14938 | B: 0.36587 | C: 0.27548
** [JOINT LOSS] ** : 0.905688
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002999 | Grad Max: 0.086351
  -> Layer: shared_layers.0.bias | Grad Mean: 0.159070 | Grad Max: 0.675847
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006203
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000596 | Grad Max: 0.000596
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001058 | Grad Max: 0.144057
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018836 | Grad Max: 0.775138
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.006549
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008185 | Grad Max: 0.048351
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000282
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001822 | Grad Max: 0.004792
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000122
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000492 | Grad Max: 0.001361
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000486 | Grad Max: 0.001165
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009928 | Grad Max: 0.009928
[GRADIENT NORM TOTAL] 3.1712

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.614
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6060094  0.39399058] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.061
[MASKS] A(Pass/Fail): 576/1040 | B: 602/1446 | C: 433/1615
[LOSS Ex1] A: 0.64782 | B: 0.64268 | C: 0.63424
[LOGITS Ex2 A] Mean Abs: 2.067 | Max: 6.491
[LOSS Ex2] A: 0.14135 | B: 0.36790 | C: 0.27029
** [JOINT LOSS] ** : 0.901429
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007452 | Grad Max: 0.202066
  -> Layer: shared_layers.0.bias | Grad Mean: 0.477778 | Grad Max: 2.311432
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.006524
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005937 | Grad Max: 0.005937
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003188 | Grad Max: 0.368864
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058916 | Grad Max: 2.062617
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000472 | Grad Max: 0.015984
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028523 | Grad Max: 0.140471
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000646
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006132 | Grad Max: 0.012467
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000304
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001605 | Grad Max: 0.003878
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001458 | Grad Max: 0.002930
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030594 | Grad Max: 0.030594
[GRADIENT NORM TOTAL] 9.8399

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.773
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070545 0.4929455] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 690/1358 | B: 603/1445 | C: 422/1626
[LOSS Ex1] A: 0.64830 | B: 0.63854 | C: 0.63492
[LOGITS Ex2 A] Mean Abs: 2.043 | Max: 6.941
[LOSS Ex2] A: 0.14671 | B: 0.35574 | C: 0.28943
** [JOINT LOSS] ** : 0.904544
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008898 | Grad Max: 0.271319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.510641 | Grad Max: 2.365273
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.005658
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001142 | Grad Max: 0.001142
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003472 | Grad Max: 0.404797
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063632 | Grad Max: 2.261539
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000507 | Grad Max: 0.017577
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030354 | Grad Max: 0.170910
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000725
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006550 | Grad Max: 0.013795
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000332
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001717 | Grad Max: 0.003925
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001629 | Grad Max: 0.003266
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033472 | Grad Max: 0.033472
[GRADIENT NORM TOTAL] 10.5019

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.725
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109345  0.48906556] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 690/1358 | B: 567/1289 | C: 406/1642
[LOSS Ex1] A: 0.64559 | B: 0.64263 | C: 0.63689
[LOGITS Ex2 A] Mean Abs: 2.030 | Max: 6.960
[LOSS Ex2] A: 0.13781 | B: 0.33962 | C: 0.28694
** [JOINT LOSS] ** : 0.896497
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005052 | Grad Max: 0.136568
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132034 | Grad Max: 0.643784
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.006530
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007833 | Grad Max: 0.007833
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001028 | Grad Max: 0.167035
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018137 | Grad Max: 0.932139
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.004209
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008024 | Grad Max: 0.035266
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000276
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001820 | Grad Max: 0.004585
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000131
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000495 | Grad Max: 0.001390
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000560 | Grad Max: 0.001398
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010773 | Grad Max: 0.010773
[GRADIENT NORM TOTAL] 3.0135

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.748
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029956 0.4970044] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 684/1364 | B: 604/1444 | C: 411/1637
[LOSS Ex1] A: 0.64459 | B: 0.64170 | C: 0.63942
[LOGITS Ex2 A] Mean Abs: 1.930 | Max: 6.900
[LOSS Ex2] A: 0.15843 | B: 0.40906 | C: 0.28181
** [JOINT LOSS] ** : 0.924999
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006659 | Grad Max: 0.228085
  -> Layer: shared_layers.0.bias | Grad Mean: 0.723277 | Grad Max: 3.110711
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.007058
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008381 | Grad Max: 0.008381
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004458 | Grad Max: 0.653719
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083726 | Grad Max: 3.683842
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000694 | Grad Max: 0.025008
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042519 | Grad Max: 0.244467
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000987
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008792 | Grad Max: 0.018090
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000420
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002320 | Grad Max: 0.005475
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002243 | Grad Max: 0.004364
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046979 | Grad Max: 0.046979
[GRADIENT NORM TOTAL] 14.5055

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.619
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50299233 0.49700767] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.060
[MASKS] A(Pass/Fail): 660/1388 | B: 602/1446 | C: 405/1643
[LOSS Ex1] A: 0.65219 | B: 0.64248 | C: 0.63804
[LOGITS Ex2 A] Mean Abs: 1.903 | Max: 6.366
[LOSS Ex2] A: 0.14474 | B: 0.41638 | C: 0.27885
** [JOINT LOSS] ** : 0.924224
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011520 | Grad Max: 0.264813
  -> Layer: shared_layers.0.bias | Grad Mean: 0.902009 | Grad Max: 3.676590
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.005800
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003673 | Grad Max: 0.003673
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005761 | Grad Max: 0.737653
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.108282 | Grad Max: 4.156323
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000905 | Grad Max: 0.030505
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.055080 | Grad Max: 0.282614
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001216
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011627 | Grad Max: 0.023436
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000563
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003065 | Grad Max: 0.007174
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002976 | Grad Max: 0.005624
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061562 | Grad Max: 0.061562
[GRADIENT NORM TOTAL] 17.9378

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.548
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5411448  0.45885518] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.058
[MASKS] A(Pass/Fail): 661/1387 | B: 603/1445 | C: 391/1657
[LOSS Ex1] A: 0.65208 | B: 0.63834 | C: 0.63899
[LOGITS Ex2 A] Mean Abs: 1.893 | Max: 5.985
[LOSS Ex2] A: 0.15113 | B: 0.37881 | C: 0.27447
** [JOINT LOSS] ** : 0.911276
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008997 | Grad Max: 0.226028
  -> Layer: shared_layers.0.bias | Grad Mean: 0.577809 | Grad Max: 2.362106
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002140 | Grad Max: 0.005544
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004387 | Grad Max: 0.004387
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003742 | Grad Max: 0.452272
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070308 | Grad Max: 2.415835
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000595 | Grad Max: 0.019633
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036093 | Grad Max: 0.194101
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000831
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007638 | Grad Max: 0.016456
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000376
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002008 | Grad Max: 0.004732
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001993 | Grad Max: 0.003560
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040007 | Grad Max: 0.040007
[GRADIENT NORM TOTAL] 11.3614

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.698
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.714679   0.28532094] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.061
[MASKS] A(Pass/Fail): 708/1340 | B: 567/1289 | C: 401/1647
[LOSS Ex1] A: 0.64677 | B: 0.64245 | C: 0.63862
[LOGITS Ex2 A] Mean Abs: 2.002 | Max: 6.132
[LOSS Ex2] A: 0.13640 | B: 0.34827 | C: 0.28876
** [JOINT LOSS] ** : 0.900422
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002271 | Grad Max: 0.056541
  -> Layer: shared_layers.0.bias | Grad Mean: 0.081321 | Grad Max: 0.407096
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.006243
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004607 | Grad Max: 0.004607
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000714 | Grad Max: 0.141690
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012403 | Grad Max: 0.798022
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.004265
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003887 | Grad Max: 0.034846
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000156
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000786 | Grad Max: 0.003213
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000209 | Grad Max: 0.000791
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000223 | Grad Max: 0.000939
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003877 | Grad Max: 0.003877
[GRADIENT NORM TOTAL] 2.3834

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.777
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50092936 0.4990706 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 697/1351 | B: 605/1443 | C: 406/1642
[LOSS Ex1] A: 0.65277 | B: 0.64152 | C: 0.63584
[LOGITS Ex2 A] Mean Abs: 2.023 | Max: 5.830
[LOSS Ex2] A: 0.13078 | B: 0.37968 | C: 0.27281
** [JOINT LOSS] ** : 0.904465
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004181 | Grad Max: 0.157812
  -> Layer: shared_layers.0.bias | Grad Mean: 0.424656 | Grad Max: 1.921044
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006037
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005813 | Grad Max: 0.005813
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002739 | Grad Max: 0.341466
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050985 | Grad Max: 1.884191
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000433 | Grad Max: 0.015987
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026777 | Grad Max: 0.153787
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000564
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005609 | Grad Max: 0.011757
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000295
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001475 | Grad Max: 0.003783
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001329 | Grad Max: 0.002776
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029201 | Grad Max: 0.029201
[GRADIENT NORM TOTAL] 8.6682

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.531
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.67173374 0.32826623] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.061
[MASKS] A(Pass/Fail): 681/1367 | B: 602/1446 | C: 413/1635
[LOSS Ex1] A: 0.64915 | B: 0.64230 | C: 0.63693
[LOGITS Ex2 A] Mean Abs: 2.001 | Max: 5.397
[LOSS Ex2] A: 0.16017 | B: 0.36239 | C: 0.25805
** [JOINT LOSS] ** : 0.902997
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004289 | Grad Max: 0.116330
  -> Layer: shared_layers.0.bias | Grad Mean: 0.300202 | Grad Max: 1.264248
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005599
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001828 | Grad Max: 0.001828
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.341771
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035231 | Grad Max: 1.898149
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000274 | Grad Max: 0.010792
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016852 | Grad Max: 0.101600
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000354
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003421 | Grad Max: 0.007528
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000903 | Grad Max: 0.002209
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000752 | Grad Max: 0.001896
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017901 | Grad Max: 0.017901
[GRADIENT NORM TOTAL] 6.3194

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.619
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.60671204 0.39328793] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 577/1039 | B: 603/1445 | C: 416/1632
[LOSS Ex1] A: 0.64754 | B: 0.63816 | C: 0.63519
[LOGITS Ex2 A] Mean Abs: 2.012 | Max: 5.858
[LOSS Ex2] A: 0.14788 | B: 0.34520 | C: 0.26841
** [JOINT LOSS] ** : 0.894130
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005707 | Grad Max: 0.173097
  -> Layer: shared_layers.0.bias | Grad Mean: 0.277185 | Grad Max: 1.108382
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.006760
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007223 | Grad Max: 0.007223
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001829 | Grad Max: 0.515995
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033450 | Grad Max: 2.889395
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.010928
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015121 | Grad Max: 0.091395
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000412
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003270 | Grad Max: 0.007566
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000197
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000866 | Grad Max: 0.002185
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000906 | Grad Max: 0.002160
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017843 | Grad Max: 0.017843
[GRADIENT NORM TOTAL] 6.2330

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.778
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070422  0.49295774] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 692/1356 | B: 569/1287 | C: 394/1654
[LOSS Ex1] A: 0.64804 | B: 0.64227 | C: 0.63887
[LOGITS Ex2 A] Mean Abs: 1.993 | Max: 6.620
[LOSS Ex2] A: 0.14167 | B: 0.35229 | C: 0.27703
** [JOINT LOSS] ** : 0.900058
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005610 | Grad Max: 0.195088
  -> Layer: shared_layers.0.bias | Grad Mean: 0.308712 | Grad Max: 1.369542
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002271 | Grad Max: 0.006415
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011016 | Grad Max: 0.011016
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.420056
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037771 | Grad Max: 2.350629
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.010529
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017136 | Grad Max: 0.083133
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000414
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003718 | Grad Max: 0.007961
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000190
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000992 | Grad Max: 0.002367
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000946 | Grad Max: 0.002091
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019934 | Grad Max: 0.019934
[GRADIENT NORM TOTAL] 6.6271

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.730
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109994 0.4890006] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 690/1358 | B: 605/1443 | C: 441/1607
[LOSS Ex1] A: 0.64533 | B: 0.64135 | C: 0.63220
[LOGITS Ex2 A] Mean Abs: 1.987 | Max: 5.925
[LOSS Ex2] A: 0.14608 | B: 0.36132 | C: 0.25167
** [JOINT LOSS] ** : 0.892648
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001728 | Grad Max: 0.038921
  -> Layer: shared_layers.0.bias | Grad Mean: 0.101603 | Grad Max: 0.583791
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002285 | Grad Max: 0.006359
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004938 | Grad Max: 0.004938
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000676 | Grad Max: 0.216822
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011912 | Grad Max: 1.221963
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.004297
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004063 | Grad Max: 0.027800
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000214
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000768 | Grad Max: 0.003573
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000073
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000197 | Grad Max: 0.000773
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000386 | Grad Max: 0.001214
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003707 | Grad Max: 0.003707
[GRADIENT NORM TOTAL] 2.5906

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.753
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.502986 0.497014] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 684/1364 | B: 603/1445 | C: 327/1049
[LOSS Ex1] A: 0.64435 | B: 0.64213 | C: 0.62956
[LOGITS Ex2 A] Mean Abs: 2.022 | Max: 6.570
[LOSS Ex2] A: 0.16313 | B: 0.37554 | C: 0.23716
** [JOINT LOSS] ** : 0.897287
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007993 | Grad Max: 0.261216
  -> Layer: shared_layers.0.bias | Grad Mean: 0.505051 | Grad Max: 2.303274
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.007127
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001136 | Grad Max: 0.001136
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003427 | Grad Max: 0.438750
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062319 | Grad Max: 2.448532
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000521 | Grad Max: 0.015888
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031593 | Grad Max: 0.154444
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000673
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006905 | Grad Max: 0.014321
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000310
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001855 | Grad Max: 0.003990
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001730 | Grad Max: 0.003597
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037145 | Grad Max: 0.037145
[GRADIENT NORM TOTAL] 10.4053

[EPOCH SUMMARY] Train Loss: 0.9043

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8901 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 106/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.623
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50290793 0.49709207] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.060
[MASKS] A(Pass/Fail): 660/1388 | B: 603/1445 | C: 430/1618
[LOSS Ex1] A: 0.65198 | B: 0.63799 | C: 0.63322
[LOGITS Ex2 A] Mean Abs: 1.996 | Max: 6.214
[LOSS Ex2] A: 0.14142 | B: 0.35373 | C: 0.28787
** [JOINT LOSS] ** : 0.902068
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007735 | Grad Max: 0.205423
  -> Layer: shared_layers.0.bias | Grad Mean: 0.549919 | Grad Max: 2.513360
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.006726
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011023 | Grad Max: 0.011023
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003505 | Grad Max: 0.465356
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064933 | Grad Max: 2.607980
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.019639
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032447 | Grad Max: 0.183136
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000718
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006930 | Grad Max: 0.014430
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000362
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001828 | Grad Max: 0.004503
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001607 | Grad Max: 0.003142
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034971 | Grad Max: 0.034971
[GRADIENT NORM TOTAL] 11.2645

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.552
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5412331 0.4587669] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.547 | Std: 0.059
[MASKS] A(Pass/Fail): 662/1386 | B: 569/1287 | C: 450/1598
[LOSS Ex1] A: 0.65188 | B: 0.64211 | C: 0.63691
[LOGITS Ex2 A] Mean Abs: 1.931 | Max: 5.882
[LOSS Ex2] A: 0.14641 | B: 0.34000 | C: 0.24851
** [JOINT LOSS] ** : 0.888607
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003888 | Grad Max: 0.113714
  -> Layer: shared_layers.0.bias | Grad Mean: 0.098523 | Grad Max: 0.485299
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.005718
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003794 | Grad Max: 0.003794
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000851 | Grad Max: 0.139101
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014576 | Grad Max: 0.766629
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.004791
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005078 | Grad Max: 0.036864
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000178
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001193 | Grad Max: 0.003700
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000335 | Grad Max: 0.001022
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000524 | Grad Max: 0.001740
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007653 | Grad Max: 0.007653
[GRADIENT NORM TOTAL] 2.4158

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.702
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7159532 0.2840468] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.062
[MASKS] A(Pass/Fail): 708/1340 | B: 605/1443 | C: 408/1640
[LOSS Ex1] A: 0.64653 | B: 0.64118 | C: 0.63904
[LOGITS Ex2 A] Mean Abs: 1.958 | Max: 6.332
[LOSS Ex2] A: 0.13711 | B: 0.39916 | C: 0.28609
** [JOINT LOSS] ** : 0.916372
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006666 | Grad Max: 0.202244
  -> Layer: shared_layers.0.bias | Grad Mean: 0.596156 | Grad Max: 2.730278
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.006048
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002975 | Grad Max: 0.002975
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003692 | Grad Max: 0.462359
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069274 | Grad Max: 2.560252
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000571 | Grad Max: 0.018688
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035253 | Grad Max: 0.185974
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000732
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007447 | Grad Max: 0.014967
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000375
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001974 | Grad Max: 0.004756
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001860 | Grad Max: 0.003592
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039313 | Grad Max: 0.039313
[GRADIENT NORM TOTAL] 11.8024

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.781
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50098455 0.49901545] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 699/1349 | B: 604/1444 | C: 434/1614
[LOSS Ex1] A: 0.65256 | B: 0.64196 | C: 0.63893
[LOGITS Ex2 A] Mean Abs: 1.945 | Max: 5.796
[LOSS Ex2] A: 0.13792 | B: 0.39658 | C: 0.29336
** [JOINT LOSS] ** : 0.920441
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010550 | Grad Max: 0.262212
  -> Layer: shared_layers.0.bias | Grad Mean: 0.758947 | Grad Max: 3.079138
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.005470
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001893 | Grad Max: 0.001893
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004866 | Grad Max: 0.566247
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.090617 | Grad Max: 3.164000
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000771 | Grad Max: 0.025905
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047369 | Grad Max: 0.267800
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000096 | Grad Max: 0.000993
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010147 | Grad Max: 0.020384
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000443
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002696 | Grad Max: 0.006007
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002619 | Grad Max: 0.005069
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.054420 | Grad Max: 0.054420
[GRADIENT NORM TOTAL] 14.7533

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.535
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.67262954 0.32737046] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 683/1365 | B: 604/1444 | C: 429/1619
[LOSS Ex1] A: 0.64893 | B: 0.63783 | C: 0.63682
[LOGITS Ex2 A] Mean Abs: 1.952 | Max: 6.116
[LOSS Ex2] A: 0.15427 | B: 0.35974 | C: 0.26836
** [JOINT LOSS] ** : 0.901983
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007448 | Grad Max: 0.225885
  -> Layer: shared_layers.0.bias | Grad Mean: 0.396249 | Grad Max: 1.493262
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006249
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002768 | Grad Max: 0.002768
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002552 | Grad Max: 0.287423
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046782 | Grad Max: 1.444238
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000399 | Grad Max: 0.012117
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024243 | Grad Max: 0.115778
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000620
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005232 | Grad Max: 0.011567
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001375 | Grad Max: 0.003365
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001336 | Grad Max: 0.002602
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027369 | Grad Max: 0.027369
[GRADIENT NORM TOTAL] 7.5145

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.623
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6071847  0.39281532] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 577/1039 | B: 569/1287 | C: 417/1631
[LOSS Ex1] A: 0.64732 | B: 0.64194 | C: 0.63910
[LOGITS Ex2 A] Mean Abs: 2.055 | Max: 7.247
[LOSS Ex2] A: 0.13565 | B: 0.34695 | C: 0.27356
** [JOINT LOSS] ** : 0.894841
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003373 | Grad Max: 0.128613
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264962 | Grad Max: 1.506265
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005882
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003555 | Grad Max: 0.003555
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.253834
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031933 | Grad Max: 1.378420
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.009609
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014647 | Grad Max: 0.078670
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000415
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002935 | Grad Max: 0.007541
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000167
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000764 | Grad Max: 0.001909
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000655 | Grad Max: 0.001714
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015016 | Grad Max: 0.015016
[GRADIENT NORM TOTAL] 5.8616

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.783
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070288  0.49297115] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 693/1355 | B: 606/1442 | C: 447/1601
[LOSS Ex1] A: 0.64783 | B: 0.64103 | C: 0.63475
[LOGITS Ex2 A] Mean Abs: 2.033 | Max: 6.367
[LOSS Ex2] A: 0.14316 | B: 0.38510 | C: 0.27357
** [JOINT LOSS] ** : 0.908478
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006251 | Grad Max: 0.208559
  -> Layer: shared_layers.0.bias | Grad Mean: 0.583420 | Grad Max: 2.768114
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005814
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001406 | Grad Max: 0.001406
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003683 | Grad Max: 0.426443
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069402 | Grad Max: 2.347536
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000580 | Grad Max: 0.021534
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035887 | Grad Max: 0.195433
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000793
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007504 | Grad Max: 0.015661
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001961 | Grad Max: 0.004358
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001767 | Grad Max: 0.003372
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037481 | Grad Max: 0.037481
[GRADIENT NORM TOTAL] 11.9518

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.735
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51094633 0.4890536 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 690/1358 | B: 604/1444 | C: 413/1635
[LOSS Ex1] A: 0.64510 | B: 0.64182 | C: 0.63776
[LOGITS Ex2 A] Mean Abs: 2.033 | Max: 5.786
[LOSS Ex2] A: 0.15097 | B: 0.36758 | C: 0.26195
** [JOINT LOSS] ** : 0.901728
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005237 | Grad Max: 0.187891
  -> Layer: shared_layers.0.bias | Grad Mean: 0.458817 | Grad Max: 2.354578
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.006271
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004028 | Grad Max: 0.004028
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002909 | Grad Max: 0.355134
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054464 | Grad Max: 1.998014
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.015701
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027518 | Grad Max: 0.155586
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000652
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005743 | Grad Max: 0.012412
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000269
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001520 | Grad Max: 0.003620
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001393 | Grad Max: 0.002982
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029916 | Grad Max: 0.029916
[GRADIENT NORM TOTAL] 9.6202

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.758
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50302607 0.4969739 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 684/1364 | B: 605/1443 | C: 447/1601
[LOSS Ex1] A: 0.64412 | B: 0.63769 | C: 0.63249
[LOGITS Ex2 A] Mean Abs: 1.988 | Max: 6.564
[LOSS Ex2] A: 0.15425 | B: 0.35069 | C: 0.24053
** [JOINT LOSS] ** : 0.886591
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.046923
  -> Layer: shared_layers.0.bias | Grad Mean: 0.136690 | Grad Max: 0.584834
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.006514
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003009 | Grad Max: 0.003009
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000940 | Grad Max: 0.383743
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016637 | Grad Max: 2.148637
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.003992
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005348 | Grad Max: 0.032229
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000182
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001095 | Grad Max: 0.003683
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000087
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000289 | Grad Max: 0.000863
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001114
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005809 | Grad Max: 0.005809
[GRADIENT NORM TOTAL] 3.8840

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.626
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028864  0.49711356] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.548 | Std: 0.061
[MASKS] A(Pass/Fail): 662/1386 | B: 569/1287 | C: 441/1607
[LOSS Ex1] A: 0.65178 | B: 0.64181 | C: 0.63365
[LOGITS Ex2 A] Mean Abs: 1.931 | Max: 6.411
[LOSS Ex2] A: 0.14385 | B: 0.35678 | C: 0.27425
** [JOINT LOSS] ** : 0.900707
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005909 | Grad Max: 0.144674
  -> Layer: shared_layers.0.bias | Grad Mean: 0.459312 | Grad Max: 1.988287
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.005858
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006472 | Grad Max: 0.006472
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002915 | Grad Max: 0.464472
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054475 | Grad Max: 2.600086
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000440 | Grad Max: 0.015749
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027235 | Grad Max: 0.161057
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000581
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005819 | Grad Max: 0.011864
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000301
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001546 | Grad Max: 0.003731
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001507 | Grad Max: 0.002833
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031042 | Grad Max: 0.031042
[GRADIENT NORM TOTAL] 9.3580

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.555
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54129666 0.45870334] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.059
[MASKS] A(Pass/Fail): 662/1386 | B: 608/1440 | C: 420/1628
[LOSS Ex1] A: 0.65169 | B: 0.64090 | C: 0.63691
[LOGITS Ex2 A] Mean Abs: 1.930 | Max: 5.970
[LOSS Ex2] A: 0.14682 | B: 0.37510 | C: 0.26786
** [JOINT LOSS] ** : 0.906427
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003825 | Grad Max: 0.105519
  -> Layer: shared_layers.0.bias | Grad Mean: 0.272900 | Grad Max: 1.479888
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.005814
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000238 | Grad Max: 0.000238
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001807 | Grad Max: 0.257144
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032965 | Grad Max: 1.423256
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000253 | Grad Max: 0.009375
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015602 | Grad Max: 0.084971
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000392
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003365 | Grad Max: 0.007513
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000183
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000898 | Grad Max: 0.001995
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000841 | Grad Max: 0.001860
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017968 | Grad Max: 0.017968
[GRADIENT NORM TOTAL] 5.7688

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.706
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71697605 0.28302395] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.062
[MASKS] A(Pass/Fail): 710/1338 | B: 605/1443 | C: 453/1595
[LOSS Ex1] A: 0.64633 | B: 0.64169 | C: 0.63270
[LOGITS Ex2 A] Mean Abs: 2.014 | Max: 6.617
[LOSS Ex2] A: 0.13875 | B: 0.36197 | C: 0.26125
** [JOINT LOSS] ** : 0.894230
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005385 | Grad Max: 0.146922
  -> Layer: shared_layers.0.bias | Grad Mean: 0.349903 | Grad Max: 1.247234
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002291 | Grad Max: 0.006191
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005873 | Grad Max: 0.005873
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.339470
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042502 | Grad Max: 1.907193
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.011711
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020376 | Grad Max: 0.114601
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000473
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004343 | Grad Max: 0.009834
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000226
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001140 | Grad Max: 0.002798
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000991 | Grad Max: 0.002237
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020975 | Grad Max: 0.020975
[GRADIENT NORM TOTAL] 7.1898

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.786
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50093734 0.49906266] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 699/1349 | B: 605/1443 | C: 447/1601
[LOSS Ex1] A: 0.65238 | B: 0.63756 | C: 0.63555
[LOGITS Ex2 A] Mean Abs: 2.023 | Max: 5.664
[LOSS Ex2] A: 0.13594 | B: 0.34700 | C: 0.26856
** [JOINT LOSS] ** : 0.892331
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005078 | Grad Max: 0.132285
  -> Layer: shared_layers.0.bias | Grad Mean: 0.407875 | Grad Max: 1.653904
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005760
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002434 | Grad Max: 0.002434
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002602 | Grad Max: 0.402678
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048633 | Grad Max: 2.253958
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000395 | Grad Max: 0.014259
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024592 | Grad Max: 0.140473
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000558
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005202 | Grad Max: 0.011112
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000263
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001378 | Grad Max: 0.003349
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001285 | Grad Max: 0.002996
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027231 | Grad Max: 0.027231
[GRADIENT NORM TOTAL] 8.3461

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.538
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6733552  0.32664478] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 683/1365 | B: 569/1287 | C: 331/1045
[LOSS Ex1] A: 0.64873 | B: 0.64169 | C: 0.62964
[LOGITS Ex2 A] Mean Abs: 1.995 | Max: 6.022
[LOSS Ex2] A: 0.15172 | B: 0.34906 | C: 0.26923
** [JOINT LOSS] ** : 0.896690
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001896 | Grad Max: 0.043161
  -> Layer: shared_layers.0.bias | Grad Mean: 0.057338 | Grad Max: 0.284761
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.006251
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000902 | Grad Max: 0.000902
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000550 | Grad Max: 0.150429
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009289 | Grad Max: 0.843301
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003400
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002346 | Grad Max: 0.018381
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000143
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000448 | Grad Max: 0.002320
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000059
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000122 | Grad Max: 0.000556
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000214 | Grad Max: 0.000730
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001354 | Grad Max: 0.001354
[GRADIENT NORM TOTAL] 1.8810

[EPOCH SUMMARY] Train Loss: 0.9008

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8797 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8813 -> New: 0.8797)

############################## EPOCH 107/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.626
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6075865  0.39241353] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 578/1038 | B: 608/1440 | C: 463/1585
[LOSS Ex1] A: 0.64712 | B: 0.64077 | C: 0.63346
[LOGITS Ex2 A] Mean Abs: 2.009 | Max: 6.424
[LOSS Ex2] A: 0.13753 | B: 0.37468 | C: 0.24130
** [JOINT LOSS] ** : 0.891616
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003727 | Grad Max: 0.097203
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226806 | Grad Max: 0.905685
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.005855
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003308 | Grad Max: 0.003308
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001561 | Grad Max: 0.225247
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028439 | Grad Max: 1.249406
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000226 | Grad Max: 0.009900
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013889 | Grad Max: 0.092289
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000413
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003023 | Grad Max: 0.006628
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000807 | Grad Max: 0.002052
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000775 | Grad Max: 0.001858
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015965 | Grad Max: 0.015965
[GRADIENT NORM TOTAL] 4.7310

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.787
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50706255 0.49293745] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 694/1354 | B: 605/1443 | C: 432/1616
[LOSS Ex1] A: 0.64763 | B: 0.64155 | C: 0.63755
[LOGITS Ex2 A] Mean Abs: 2.007 | Max: 6.342
[LOSS Ex2] A: 0.13789 | B: 0.36515 | C: 0.25942
** [JOINT LOSS] ** : 0.896393
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001863 | Grad Max: 0.098911
  -> Layer: shared_layers.0.bias | Grad Mean: 0.056185 | Grad Max: 0.309922
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.005994
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001920 | Grad Max: 0.001920
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000490 | Grad Max: 0.103453
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.008134 | Grad Max: 0.582226
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002797
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001684 | Grad Max: 0.017922
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000124
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000294 | Grad Max: 0.001930
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000059
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000074 | Grad Max: 0.000480
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000339 | Grad Max: 0.000870
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000281 | Grad Max: 0.000281
[GRADIENT NORM TOTAL] 1.6406

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.739
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109316  0.48906842] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 690/1358 | B: 605/1443 | C: 462/1586
[LOSS Ex1] A: 0.64488 | B: 0.63739 | C: 0.63203
[LOGITS Ex2 A] Mean Abs: 2.015 | Max: 6.477
[LOSS Ex2] A: 0.14333 | B: 0.34206 | C: 0.27288
** [JOINT LOSS] ** : 0.890861
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004413 | Grad Max: 0.110087
  -> Layer: shared_layers.0.bias | Grad Mean: 0.225452 | Grad Max: 0.946322
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002435 | Grad Max: 0.006858
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009392 | Grad Max: 0.009392
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001566 | Grad Max: 0.297597
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028712 | Grad Max: 1.667716
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.008437
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014186 | Grad Max: 0.069135
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000385
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003084 | Grad Max: 0.006922
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000187
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000816 | Grad Max: 0.002064
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000780 | Grad Max: 0.002138
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016297 | Grad Max: 0.016297
[GRADIENT NORM TOTAL] 4.9064

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.762
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50304985 0.49695015] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 684/1364 | B: 570/1286 | C: 471/1577
[LOSS Ex1] A: 0.64389 | B: 0.64151 | C: 0.63366
[LOGITS Ex2 A] Mean Abs: 1.969 | Max: 5.804
[LOSS Ex2] A: 0.15585 | B: 0.34996 | C: 0.25910
** [JOINT LOSS] ** : 0.894657
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002610 | Grad Max: 0.086777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140579 | Grad Max: 0.591293
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006325
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003217 | Grad Max: 0.003217
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000907 | Grad Max: 0.332006
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015670 | Grad Max: 1.855617
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.004134
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005036 | Grad Max: 0.034448
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000183
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000927 | Grad Max: 0.003278
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000238 | Grad Max: 0.000946
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000255 | Grad Max: 0.000947
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004334 | Grad Max: 0.004334
[GRADIENT NORM TOTAL] 3.3972

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.630
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028401  0.49715987] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.061
[MASKS] A(Pass/Fail): 662/1386 | B: 608/1440 | C: 448/1600
[LOSS Ex1] A: 0.65157 | B: 0.64058 | C: 0.63575
[LOGITS Ex2 A] Mean Abs: 1.954 | Max: 6.808
[LOSS Ex2] A: 0.13375 | B: 0.36745 | C: 0.26984
** [JOINT LOSS] ** : 0.899647
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002504 | Grad Max: 0.060996
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094363 | Grad Max: 0.488117
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.006087
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007844 | Grad Max: 0.007844
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000692 | Grad Max: 0.129941
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011815 | Grad Max: 0.725151
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.004402
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004554 | Grad Max: 0.036387
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000193
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000976 | Grad Max: 0.003221
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000082
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000285 | Grad Max: 0.000798
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000375 | Grad Max: 0.001288
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007148 | Grad Max: 0.007148
[GRADIENT NORM TOTAL] 2.2021

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.559
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54148585 0.45851412] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.059
[MASKS] A(Pass/Fail): 662/1386 | B: 607/1441 | C: 468/1580
[LOSS Ex1] A: 0.65148 | B: 0.64136 | C: 0.63158
[LOGITS Ex2 A] Mean Abs: 1.964 | Max: 6.371
[LOSS Ex2] A: 0.15099 | B: 0.36922 | C: 0.26369
** [JOINT LOSS] ** : 0.902772
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002915 | Grad Max: 0.064886
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182904 | Grad Max: 0.867706
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.007814
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.016352 | Grad Max: 0.016352
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001238 | Grad Max: 0.190854
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022459 | Grad Max: 1.065493
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.006805
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009008 | Grad Max: 0.057484
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000248
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001857 | Grad Max: 0.005223
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000114
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000488 | Grad Max: 0.001400
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000460 | Grad Max: 0.001580
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009648 | Grad Max: 0.009648
[GRADIENT NORM TOTAL] 4.0687

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.711
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71833163 0.28166834] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.062
[MASKS] A(Pass/Fail): 711/1337 | B: 608/1440 | C: 469/1579
[LOSS Ex1] A: 0.64609 | B: 0.63720 | C: 0.63224
[LOGITS Ex2 A] Mean Abs: 2.003 | Max: 6.662
[LOSS Ex2] A: 0.13262 | B: 0.34297 | C: 0.23266
** [JOINT LOSS] ** : 0.874591
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001635 | Grad Max: 0.037325
  -> Layer: shared_layers.0.bias | Grad Mean: 0.096186 | Grad Max: 0.418764
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.006173
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000439 | Grad Max: 0.000439
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000706 | Grad Max: 0.210897
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012349 | Grad Max: 1.196857
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003380
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003294 | Grad Max: 0.027572
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000180
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000657 | Grad Max: 0.002826
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.000656
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000508 | Grad Max: 0.001200
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003727 | Grad Max: 0.003727
[GRADIENT NORM TOTAL] 2.6778

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.791
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50092536 0.49907464] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.061
[MASKS] A(Pass/Fail): 699/1349 | B: 570/1286 | C: 466/1582
[LOSS Ex1] A: 0.65215 | B: 0.64133 | C: 0.63564
[LOGITS Ex2 A] Mean Abs: 1.981 | Max: 6.423
[LOSS Ex2] A: 0.12880 | B: 0.34517 | C: 0.26345
** [JOINT LOSS] ** : 0.888847
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002946 | Grad Max: 0.098270
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139688 | Grad Max: 0.586628
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005860
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004731 | Grad Max: 0.004731
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001029 | Grad Max: 0.209463
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018134 | Grad Max: 1.173887
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.005395
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007676 | Grad Max: 0.039797
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000232
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001709 | Grad Max: 0.004539
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000124
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000464 | Grad Max: 0.001461
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000466 | Grad Max: 0.001453
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009130 | Grad Max: 0.009130
[GRADIENT NORM TOTAL] 3.3050

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.542
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.674489   0.32551098] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 683/1365 | B: 608/1440 | C: 452/1596
[LOSS Ex1] A: 0.64847 | B: 0.64039 | C: 0.63819
[LOGITS Ex2 A] Mean Abs: 1.984 | Max: 6.087
[LOSS Ex2] A: 0.13895 | B: 0.36968 | C: 0.28897
** [JOINT LOSS] ** : 0.908214
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.055958
  -> Layer: shared_layers.0.bias | Grad Mean: 0.081397 | Grad Max: 0.315782
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005931
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002764 | Grad Max: 0.002764
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000633 | Grad Max: 0.210123
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010930 | Grad Max: 1.184491
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.002887
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002457 | Grad Max: 0.020966
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000142
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000531 | Grad Max: 0.002534
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000158 | Grad Max: 0.000577
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000261 | Grad Max: 0.000883
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004361 | Grad Max: 0.004361
[GRADIENT NORM TOTAL] 2.5028

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.631
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.60815537 0.3918446 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.062
[MASKS] A(Pass/Fail): 578/1038 | B: 607/1441 | C: 445/1603
[LOSS Ex1] A: 0.64684 | B: 0.64116 | C: 0.63927
[LOGITS Ex2 A] Mean Abs: 2.026 | Max: 6.434
[LOSS Ex2] A: 0.13996 | B: 0.35614 | C: 0.27824
** [JOINT LOSS] ** : 0.900536
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003045 | Grad Max: 0.116475
  -> Layer: shared_layers.0.bias | Grad Mean: 0.202387 | Grad Max: 1.255913
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005864
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000854 | Grad Max: 0.000854
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001351 | Grad Max: 0.241260
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024163 | Grad Max: 1.350985
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.004923
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009585 | Grad Max: 0.044032
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000253
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002136 | Grad Max: 0.005199
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000551 | Grad Max: 0.001623
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001497
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009973 | Grad Max: 0.009973
[GRADIENT NORM TOTAL] 4.5643

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.793
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071182  0.49288175] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 695/1353 | B: 608/1440 | C: 435/1613
[LOSS Ex1] A: 0.64734 | B: 0.63699 | C: 0.63510
[LOGITS Ex2 A] Mean Abs: 2.020 | Max: 8.029
[LOSS Ex2] A: 0.13246 | B: 0.33569 | C: 0.27631
** [JOINT LOSS] ** : 0.887966
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003552 | Grad Max: 0.138063
  -> Layer: shared_layers.0.bias | Grad Mean: 0.118398 | Grad Max: 0.479727
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002283 | Grad Max: 0.006035
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004881 | Grad Max: 0.004881
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000971 | Grad Max: 0.167201
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016946 | Grad Max: 0.918232
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.004602
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006824 | Grad Max: 0.033297
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000244
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001558 | Grad Max: 0.004704
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000103
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000414 | Grad Max: 0.001099
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000436 | Grad Max: 0.001462
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008108 | Grad Max: 0.008108
[GRADIENT NORM TOTAL] 2.9013

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.745
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51088524 0.4891148 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 690/1358 | B: 570/1286 | C: 448/1600
[LOSS Ex1] A: 0.64458 | B: 0.64112 | C: 0.63615
[LOGITS Ex2 A] Mean Abs: 1.970 | Max: 5.546
[LOSS Ex2] A: 0.14369 | B: 0.35594 | C: 0.27471
** [JOINT LOSS] ** : 0.898727
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003609 | Grad Max: 0.121610
  -> Layer: shared_layers.0.bias | Grad Mean: 0.384691 | Grad Max: 1.705843
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.006804
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008879 | Grad Max: 0.008879
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002337 | Grad Max: 0.505654
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043259 | Grad Max: 2.818784
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000339 | Grad Max: 0.011650
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021276 | Grad Max: 0.119902
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000542
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004410 | Grad Max: 0.009250
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000213
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001155 | Grad Max: 0.002683
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001016 | Grad Max: 0.002085
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022419 | Grad Max: 0.022419
[GRADIENT NORM TOTAL] 8.0343

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.768
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50315887 0.49684113] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 684/1364 | B: 608/1440 | C: 455/1593
[LOSS Ex1] A: 0.64358 | B: 0.64018 | C: 0.63643
[LOGITS Ex2 A] Mean Abs: 1.963 | Max: 7.228
[LOSS Ex2] A: 0.15715 | B: 0.37997 | C: 0.26795
** [JOINT LOSS] ** : 0.908427
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004347 | Grad Max: 0.159167
  -> Layer: shared_layers.0.bias | Grad Mean: 0.417892 | Grad Max: 2.163827
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.006317
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003501 | Grad Max: 0.003501
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002640 | Grad Max: 0.521056
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049372 | Grad Max: 2.905117
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.012853
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023601 | Grad Max: 0.127272
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000500
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004956 | Grad Max: 0.010163
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000252
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001323 | Grad Max: 0.003128
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001280 | Grad Max: 0.002380
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026908 | Grad Max: 0.026908
[GRADIENT NORM TOTAL] 9.0199

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.635
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50287426 0.4971258 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.061
[MASKS] A(Pass/Fail): 662/1386 | B: 607/1441 | C: 300/1076
[LOSS Ex1] A: 0.65129 | B: 0.64095 | C: 0.63446
[LOGITS Ex2 A] Mean Abs: 1.978 | Max: 5.956
[LOSS Ex2] A: 0.12303 | B: 0.36451 | C: 0.27322
** [JOINT LOSS] ** : 0.895821
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002997 | Grad Max: 0.073089
  -> Layer: shared_layers.0.bias | Grad Mean: 0.109426 | Grad Max: 0.487654
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.006608
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011286 | Grad Max: 0.011286
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000772 | Grad Max: 0.390631
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013331 | Grad Max: 2.184359
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003922
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002041 | Grad Max: 0.024002
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000186
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000323 | Grad Max: 0.002346
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000060
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000088 | Grad Max: 0.000457
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000382 | Grad Max: 0.000932
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000034 | Grad Max: 0.000034
[GRADIENT NORM TOTAL] 3.6679

[EPOCH SUMMARY] Train Loss: 0.8956

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8802 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 108/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.564
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5416047 0.4583953] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.060
[MASKS] A(Pass/Fail): 662/1386 | B: 608/1440 | C: 447/1601
[LOSS Ex1] A: 0.65120 | B: 0.63678 | C: 0.63528
[LOGITS Ex2 A] Mean Abs: 1.976 | Max: 5.833
[LOSS Ex2] A: 0.14540 | B: 0.35122 | C: 0.26691
** [JOINT LOSS] ** : 0.895600
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003119 | Grad Max: 0.126220
  -> Layer: shared_layers.0.bias | Grad Mean: 0.313670 | Grad Max: 1.502957
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006071
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004816 | Grad Max: 0.004816
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.275892
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037854 | Grad Max: 1.536287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000307 | Grad Max: 0.014235
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019232 | Grad Max: 0.144328
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000454
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003936 | Grad Max: 0.009263
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001026 | Grad Max: 0.002431
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000896 | Grad Max: 0.002045
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019762 | Grad Max: 0.019762
[GRADIENT NORM TOTAL] 6.6807

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.716
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71985805 0.28014192] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.063
[MASKS] A(Pass/Fail): 712/1336 | B: 570/1286 | C: 461/1587
[LOSS Ex1] A: 0.64580 | B: 0.64092 | C: 0.63467
[LOGITS Ex2 A] Mean Abs: 2.019 | Max: 6.337
[LOSS Ex2] A: 0.13825 | B: 0.33835 | C: 0.24722
** [JOINT LOSS] ** : 0.881734
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003096 | Grad Max: 0.089401
  -> Layer: shared_layers.0.bias | Grad Mean: 0.180879 | Grad Max: 0.858002
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.006212
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006989 | Grad Max: 0.006989
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001201 | Grad Max: 0.231524
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022128 | Grad Max: 1.290830
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.006822
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010349 | Grad Max: 0.055254
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000268
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002219 | Grad Max: 0.005354
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000121
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000611 | Grad Max: 0.001514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000600 | Grad Max: 0.001950
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012782 | Grad Max: 0.012782
[GRADIENT NORM TOTAL] 4.0782

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.797
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009278 0.4990722] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 699/1349 | B: 608/1440 | C: 441/1607
[LOSS Ex1] A: 0.65189 | B: 0.63999 | C: 0.63521
[LOGITS Ex2 A] Mean Abs: 1.974 | Max: 6.484
[LOSS Ex2] A: 0.13849 | B: 0.37502 | C: 0.24820
** [JOINT LOSS] ** : 0.896270
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006224 | Grad Max: 0.201217
  -> Layer: shared_layers.0.bias | Grad Mean: 0.329595 | Grad Max: 1.366487
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005630
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000037 | Grad Max: 0.000037
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.274556
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040259 | Grad Max: 1.479585
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000348 | Grad Max: 0.011182
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021288 | Grad Max: 0.107245
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000459
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004692 | Grad Max: 0.009699
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001246 | Grad Max: 0.002921
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001170 | Grad Max: 0.002320
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024437 | Grad Max: 0.024437
[GRADIENT NORM TOTAL] 6.4096

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.546
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6756378  0.32436216] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.063
[MASKS] A(Pass/Fail): 683/1365 | B: 607/1441 | C: 470/1578
[LOSS Ex1] A: 0.64819 | B: 0.64076 | C: 0.63380
[LOGITS Ex2 A] Mean Abs: 1.975 | Max: 6.560
[LOSS Ex2] A: 0.15112 | B: 0.37619 | C: 0.24735
** [JOINT LOSS] ** : 0.899139
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007019 | Grad Max: 0.180353
  -> Layer: shared_layers.0.bias | Grad Mean: 0.400353 | Grad Max: 1.634655
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006540
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007781 | Grad Max: 0.007781
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002512 | Grad Max: 0.258309
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046562 | Grad Max: 1.333758
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.015328
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025769 | Grad Max: 0.141154
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000588
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005530 | Grad Max: 0.011235
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000288
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001465 | Grad Max: 0.003447
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001456 | Grad Max: 0.002825
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029282 | Grad Max: 0.029282
[GRADIENT NORM TOTAL] 7.3534

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.636
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6088483 0.3911517] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 578/1038 | B: 608/1440 | C: 433/1615
[LOSS Ex1] A: 0.64656 | B: 0.63659 | C: 0.63922
[LOGITS Ex2 A] Mean Abs: 2.048 | Max: 7.499
[LOSS Ex2] A: 0.13732 | B: 0.34665 | C: 0.26778
** [JOINT LOSS] ** : 0.891373
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.120304
  -> Layer: shared_layers.0.bias | Grad Mean: 0.248335 | Grad Max: 1.447143
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006319
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007277 | Grad Max: 0.007277
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001563 | Grad Max: 0.262363
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028639 | Grad Max: 1.472456
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.007874
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012202 | Grad Max: 0.065659
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002532 | Grad Max: 0.005981
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000133
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000668 | Grad Max: 0.001589
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000540 | Grad Max: 0.001681
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012818 | Grad Max: 0.012818
[GRADIENT NORM TOTAL] 5.5896

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.798
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071631 0.4928369] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 695/1353 | B: 570/1286 | C: 487/1561
[LOSS Ex1] A: 0.64707 | B: 0.64074 | C: 0.63576
[LOGITS Ex2 A] Mean Abs: 2.045 | Max: 8.265
[LOSS Ex2] A: 0.12930 | B: 0.33782 | C: 0.28932
** [JOINT LOSS] ** : 0.893341
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004377 | Grad Max: 0.115427
  -> Layer: shared_layers.0.bias | Grad Mean: 0.300145 | Grad Max: 1.398243
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.006122
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007924 | Grad Max: 0.007924
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.304767
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037394 | Grad Max: 1.696210
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000296 | Grad Max: 0.010011
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018200 | Grad Max: 0.097177
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000418
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003889 | Grad Max: 0.008675
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000215
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001036 | Grad Max: 0.002643
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001021 | Grad Max: 0.002392
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021114 | Grad Max: 0.021114
[GRADIENT NORM TOTAL] 6.4336

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.750
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.510954   0.48904595] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 690/1358 | B: 609/1439 | C: 474/1574
[LOSS Ex1] A: 0.64430 | B: 0.63982 | C: 0.63107
[LOGITS Ex2 A] Mean Abs: 2.015 | Max: 5.590
[LOSS Ex2] A: 0.13603 | B: 0.36484 | C: 0.26697
** [JOINT LOSS] ** : 0.894342
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002866 | Grad Max: 0.065265
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084908 | Grad Max: 0.441308
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002352 | Grad Max: 0.006653
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006544 | Grad Max: 0.006544
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000765 | Grad Max: 0.193133
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013247 | Grad Max: 1.081777
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003220
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005052 | Grad Max: 0.025503
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000189
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001132 | Grad Max: 0.003452
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000293 | Grad Max: 0.001002
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000331 | Grad Max: 0.001106
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006141 | Grad Max: 0.006141
[GRADIENT NORM TOTAL] 2.4173

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.774
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5031857  0.49681428] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 684/1364 | B: 607/1441 | C: 473/1575
[LOSS Ex1] A: 0.64331 | B: 0.64059 | C: 0.63225
[LOGITS Ex2 A] Mean Abs: 1.973 | Max: 6.455
[LOSS Ex2] A: 0.15721 | B: 0.38250 | C: 0.26074
** [JOINT LOSS] ** : 0.905535
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005656 | Grad Max: 0.184304
  -> Layer: shared_layers.0.bias | Grad Mean: 0.509514 | Grad Max: 2.418754
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.006347
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000939 | Grad Max: 0.000939
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003101 | Grad Max: 0.535005
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058024 | Grad Max: 2.952957
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000463 | Grad Max: 0.014967
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028946 | Grad Max: 0.150901
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000618
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006014 | Grad Max: 0.012597
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000292
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001587 | Grad Max: 0.003527
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001491 | Grad Max: 0.002569
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031495 | Grad Max: 0.031495
[GRADIENT NORM TOTAL] 10.5124

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.640
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028316  0.49716848] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.062
[MASKS] A(Pass/Fail): 663/1385 | B: 608/1440 | C: 489/1559
[LOSS Ex1] A: 0.65106 | B: 0.63641 | C: 0.63177
[LOGITS Ex2 A] Mean Abs: 1.929 | Max: 6.682
[LOSS Ex2] A: 0.14364 | B: 0.36640 | C: 0.25972
** [JOINT LOSS] ** : 0.896335
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007240 | Grad Max: 0.176619
  -> Layer: shared_layers.0.bias | Grad Mean: 0.572778 | Grad Max: 2.490349
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006045
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006523 | Grad Max: 0.006523
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003748 | Grad Max: 0.422524
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070169 | Grad Max: 2.383838
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000584 | Grad Max: 0.018924
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036568 | Grad Max: 0.197122
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000817
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007763 | Grad Max: 0.017055
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000394
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002056 | Grad Max: 0.004858
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001964 | Grad Max: 0.003385
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040706 | Grad Max: 0.040706
[GRADIENT NORM TOTAL] 11.7107

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.568
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54179025 0.45820972] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.548 | Std: 0.060
[MASKS] A(Pass/Fail): 662/1386 | B: 570/1286 | C: 462/1586
[LOSS Ex1] A: 0.65098 | B: 0.64057 | C: 0.63488
[LOGITS Ex2 A] Mean Abs: 1.941 | Max: 7.488
[LOSS Ex2] A: 0.14105 | B: 0.34496 | C: 0.25891
** [JOINT LOSS] ** : 0.890446
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003196 | Grad Max: 0.076776
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207932 | Grad Max: 0.956079
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.005653
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000403 | Grad Max: 0.000403
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001350 | Grad Max: 0.232827
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024880 | Grad Max: 1.307485
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.009438
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013276 | Grad Max: 0.085765
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000341
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002800 | Grad Max: 0.006401
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000177
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000736 | Grad Max: 0.002040
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000686 | Grad Max: 0.001930
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014381 | Grad Max: 0.014381
[GRADIENT NORM TOTAL] 4.2408

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.721
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7211653  0.27883464] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.063
[MASKS] A(Pass/Fail): 712/1336 | B: 609/1439 | C: 476/1572
[LOSS Ex1] A: 0.64555 | B: 0.63964 | C: 0.63305
[LOGITS Ex2 A] Mean Abs: 2.065 | Max: 6.254
[LOSS Ex2] A: 0.14648 | B: 0.36853 | C: 0.29700
** [JOINT LOSS] ** : 0.910086
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007885 | Grad Max: 0.205686
  -> Layer: shared_layers.0.bias | Grad Mean: 0.584902 | Grad Max: 2.587913
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002247 | Grad Max: 0.005792
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002407 | Grad Max: 0.002407
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003764 | Grad Max: 0.426963
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069852 | Grad Max: 2.324125
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000547 | Grad Max: 0.019074
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034167 | Grad Max: 0.194871
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000690
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007239 | Grad Max: 0.014610
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000334
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001871 | Grad Max: 0.004285
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001735 | Grad Max: 0.003123
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035516 | Grad Max: 0.035516
[GRADIENT NORM TOTAL] 11.8424

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.801
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009626 0.4990374] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 699/1349 | B: 607/1441 | C: 457/1591
[LOSS Ex1] A: 0.65168 | B: 0.64042 | C: 0.63544
[LOGITS Ex2 A] Mean Abs: 2.082 | Max: 6.105
[LOSS Ex2] A: 0.14158 | B: 0.39517 | C: 0.29010
** [JOINT LOSS] ** : 0.918128
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008829 | Grad Max: 0.314288
  -> Layer: shared_layers.0.bias | Grad Mean: 0.899708 | Grad Max: 4.253185
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.005510
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003792 | Grad Max: 0.003792
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005724 | Grad Max: 0.634322
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107226 | Grad Max: 3.591946
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000863 | Grad Max: 0.033421
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.054270 | Grad Max: 0.324101
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001126
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011360 | Grad Max: 0.022736
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000496
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002976 | Grad Max: 0.006812
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002701 | Grad Max: 0.004601
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057296 | Grad Max: 0.057296
[GRADIENT NORM TOTAL] 18.6740

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.549
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6765464  0.32345363] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 684/1364 | B: 608/1440 | C: 427/1621
[LOSS Ex1] A: 0.64797 | B: 0.63625 | C: 0.63629
[LOGITS Ex2 A] Mean Abs: 2.064 | Max: 6.247
[LOSS Ex2] A: 0.16054 | B: 0.36157 | C: 0.29683
** [JOINT LOSS] ** : 0.913151
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006997 | Grad Max: 0.223177
  -> Layer: shared_layers.0.bias | Grad Mean: 0.680833 | Grad Max: 2.852823
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.006158
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000879 | Grad Max: 0.000879
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004273 | Grad Max: 0.500297
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080234 | Grad Max: 2.826247
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000671 | Grad Max: 0.024937
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042353 | Grad Max: 0.251689
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.000862
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008805 | Grad Max: 0.018374
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000396
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002311 | Grad Max: 0.005367
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002057 | Grad Max: 0.003616
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044062 | Grad Max: 0.044062
[GRADIENT NORM TOTAL] 13.8491

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.641
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6093364 0.3906636] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 578/1038 | B: 570/1286 | C: 299/1077
[LOSS Ex1] A: 0.64634 | B: 0.64041 | C: 0.63322
[LOGITS Ex2 A] Mean Abs: 2.050 | Max: 5.807
[LOSS Ex2] A: 0.13632 | B: 0.34107 | C: 0.25672
** [JOINT LOSS] ** : 0.884692
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001700 | Grad Max: 0.025605
  -> Layer: shared_layers.0.bias | Grad Mean: 0.077471 | Grad Max: 0.265646
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006100
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002044 | Grad Max: 0.002044
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000649 | Grad Max: 0.348041
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011307 | Grad Max: 1.953935
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003467
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002345 | Grad Max: 0.026262
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000172
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000425 | Grad Max: 0.002821
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000113 | Grad Max: 0.000514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000405 | Grad Max: 0.001110
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001691 | Grad Max: 0.001691
[GRADIENT NORM TOTAL] 3.0918

[EPOCH SUMMARY] Train Loss: 0.8979

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8840 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 109/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.803
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071257 0.4928743] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 695/1353 | B: 609/1439 | C: 444/1604
[LOSS Ex1] A: 0.64686 | B: 0.63950 | C: 0.63707
[LOGITS Ex2 A] Mean Abs: 2.000 | Max: 6.311
[LOSS Ex2] A: 0.13694 | B: 0.39071 | C: 0.27925
** [JOINT LOSS] ** : 0.910110
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008388 | Grad Max: 0.241826
  -> Layer: shared_layers.0.bias | Grad Mean: 0.591426 | Grad Max: 2.685166
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005838
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003237 | Grad Max: 0.003237
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003775 | Grad Max: 0.463758
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070013 | Grad Max: 2.584737
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.018746
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034403 | Grad Max: 0.185875
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000735
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007352 | Grad Max: 0.015126
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000350
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001965 | Grad Max: 0.004511
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001874 | Grad Max: 0.003321
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039865 | Grad Max: 0.039865
[GRADIENT NORM TOTAL] 11.8331

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.755
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51096463 0.48903537] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 691/1357 | B: 607/1441 | C: 449/1599
[LOSS Ex1] A: 0.64408 | B: 0.64028 | C: 0.63428
[LOGITS Ex2 A] Mean Abs: 1.982 | Max: 6.578
[LOSS Ex2] A: 0.13443 | B: 0.39042 | C: 0.26743
** [JOINT LOSS] ** : 0.903639
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007567 | Grad Max: 0.196971
  -> Layer: shared_layers.0.bias | Grad Mean: 0.623308 | Grad Max: 2.729717
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002370 | Grad Max: 0.006803
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011067 | Grad Max: 0.011067
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003920 | Grad Max: 0.487180
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073128 | Grad Max: 2.758421
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000585 | Grad Max: 0.020431
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036886 | Grad Max: 0.204904
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000681
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007867 | Grad Max: 0.015479
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000385
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002080 | Grad Max: 0.005015
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001890 | Grad Max: 0.003649
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040556 | Grad Max: 0.040556
[GRADIENT NORM TOTAL] 12.4371

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.778
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032143  0.49678567] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 685/1363 | B: 608/1440 | C: 449/1599
[LOSS Ex1] A: 0.64310 | B: 0.63611 | C: 0.63380
[LOGITS Ex2 A] Mean Abs: 1.980 | Max: 6.685
[LOSS Ex2] A: 0.14603 | B: 0.35141 | C: 0.24968
** [JOINT LOSS] ** : 0.886707
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003536 | Grad Max: 0.095605
  -> Layer: shared_layers.0.bias | Grad Mean: 0.265313 | Grad Max: 0.961489
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002335 | Grad Max: 0.006120
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002133 | Grad Max: 0.002133
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001667 | Grad Max: 0.260715
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030723 | Grad Max: 1.461794
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000248 | Grad Max: 0.011682
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015387 | Grad Max: 0.113249
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000344
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003115 | Grad Max: 0.006534
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000806 | Grad Max: 0.002015
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000731 | Grad Max: 0.001841
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014944 | Grad Max: 0.014944
[GRADIENT NORM TOTAL] 5.3857

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.644
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028035 0.4971965] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.062
[MASKS] A(Pass/Fail): 663/1385 | B: 570/1286 | C: 466/1582
[LOSS Ex1] A: 0.65088 | B: 0.64027 | C: 0.63512
[LOGITS Ex2 A] Mean Abs: 1.997 | Max: 5.486
[LOSS Ex2] A: 0.13985 | B: 0.34426 | C: 0.28503
** [JOINT LOSS] ** : 0.898468
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004274 | Grad Max: 0.143856
  -> Layer: shared_layers.0.bias | Grad Mean: 0.363221 | Grad Max: 1.782512
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.006240
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009565 | Grad Max: 0.009565
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.371051
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042964 | Grad Max: 2.073478
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000355 | Grad Max: 0.013300
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022351 | Grad Max: 0.121769
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000506
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004713 | Grad Max: 0.010713
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001251 | Grad Max: 0.002990
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001102 | Grad Max: 0.002323
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024163 | Grad Max: 0.024163
[GRADIENT NORM TOTAL] 7.7659

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.572
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5417897 0.4582103] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 662/1386 | B: 609/1439 | C: 485/1563
[LOSS Ex1] A: 0.65081 | B: 0.63936 | C: 0.63334
[LOGITS Ex2 A] Mean Abs: 2.022 | Max: 5.930
[LOSS Ex2] A: 0.14745 | B: 0.37262 | C: 0.29537
** [JOINT LOSS] ** : 0.912983
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007328 | Grad Max: 0.232219
  -> Layer: shared_layers.0.bias | Grad Mean: 0.641800 | Grad Max: 3.028861
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005723
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005706 | Grad Max: 0.005706
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004093 | Grad Max: 0.473843
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.076659 | Grad Max: 2.650718
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000624 | Grad Max: 0.024488
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039427 | Grad Max: 0.225977
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000793
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008314 | Grad Max: 0.016510
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000397
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002184 | Grad Max: 0.005016
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001929 | Grad Max: 0.003473
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041303 | Grad Max: 0.041303
[GRADIENT NORM TOTAL] 13.1337

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.724
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7221304  0.27786958] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.063
[MASKS] A(Pass/Fail): 712/1336 | B: 607/1441 | C: 457/1591
[LOSS Ex1] A: 0.64537 | B: 0.64015 | C: 0.63456
[LOGITS Ex2 A] Mean Abs: 2.036 | Max: 6.734
[LOSS Ex2] A: 0.13752 | B: 0.36563 | C: 0.27635
** [JOINT LOSS] ** : 0.899857
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005916 | Grad Max: 0.150784
  -> Layer: shared_layers.0.bias | Grad Mean: 0.414319 | Grad Max: 1.523596
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.006422
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000874 | Grad Max: 0.000874
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002657 | Grad Max: 0.392569
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048958 | Grad Max: 2.196939
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.013311
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024434 | Grad Max: 0.127284
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000558
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005213 | Grad Max: 0.011032
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001357 | Grad Max: 0.003072
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001172 | Grad Max: 0.002416
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025014 | Grad Max: 0.025014
[GRADIENT NORM TOTAL] 8.4670

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.806
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50095206 0.49904794] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.062
[MASKS] A(Pass/Fail): 700/1348 | B: 608/1440 | C: 457/1591
[LOSS Ex1] A: 0.65151 | B: 0.63598 | C: 0.63732
[LOGITS Ex2 A] Mean Abs: 1.985 | Max: 6.312
[LOSS Ex2] A: 0.12831 | B: 0.35519 | C: 0.25453
** [JOINT LOSS] ** : 0.887613
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003934 | Grad Max: 0.126344
  -> Layer: shared_layers.0.bias | Grad Mean: 0.225344 | Grad Max: 0.948809
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005935
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001857 | Grad Max: 0.001857
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001517 | Grad Max: 0.229924
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027597 | Grad Max: 1.291338
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008889
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013808 | Grad Max: 0.080393
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000338
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002994 | Grad Max: 0.006649
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000156
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000783 | Grad Max: 0.002004
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000725 | Grad Max: 0.001961
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014786 | Grad Max: 0.014786
[GRADIENT NORM TOTAL] 4.7139

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.552
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.67724586 0.32275417] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 684/1364 | B: 570/1286 | C: 434/1614
[LOSS Ex1] A: 0.64779 | B: 0.64015 | C: 0.63369
[LOGITS Ex2 A] Mean Abs: 1.970 | Max: 6.163
[LOSS Ex2] A: 0.15100 | B: 0.36494 | C: 0.27570
** [JOINT LOSS] ** : 0.904424
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005815 | Grad Max: 0.178007
  -> Layer: shared_layers.0.bias | Grad Mean: 0.388153 | Grad Max: 1.624420
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.006107
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003963 | Grad Max: 0.003963
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002446 | Grad Max: 0.373433
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045103 | Grad Max: 2.050425
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000376 | Grad Max: 0.013883
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023317 | Grad Max: 0.129764
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000485
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004947 | Grad Max: 0.010679
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000255
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001303 | Grad Max: 0.003233
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001141 | Grad Max: 0.002185
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024790 | Grad Max: 0.024790
[GRADIENT NORM TOTAL] 7.7401

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.644
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.609627   0.39037293] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 579/1037 | B: 610/1438 | C: 489/1559
[LOSS Ex1] A: 0.64616 | B: 0.63924 | C: 0.63376
[LOGITS Ex2 A] Mean Abs: 2.015 | Max: 6.230
[LOSS Ex2] A: 0.13781 | B: 0.37023 | C: 0.24822
** [JOINT LOSS] ** : 0.891809
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.061111
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111528 | Grad Max: 0.580055
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006420
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000699 | Grad Max: 0.000699
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000867 | Grad Max: 0.230776
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015484 | Grad Max: 1.303083
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.004572
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006084 | Grad Max: 0.040072
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000197
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001255 | Grad Max: 0.003599
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000346 | Grad Max: 0.001001
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000389 | Grad Max: 0.001230
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007826 | Grad Max: 0.007826
[GRADIENT NORM TOTAL] 3.1290

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.807
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071226  0.49287742] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 696/1352 | B: 607/1441 | C: 468/1580
[LOSS Ex1] A: 0.64669 | B: 0.64002 | C: 0.63073
[LOGITS Ex2 A] Mean Abs: 2.049 | Max: 6.741
[LOSS Ex2] A: 0.13311 | B: 0.36744 | C: 0.25706
** [JOINT LOSS] ** : 0.891682
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006638 | Grad Max: 0.202962
  -> Layer: shared_layers.0.bias | Grad Mean: 0.537367 | Grad Max: 2.544029
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.006212
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005077 | Grad Max: 0.005077
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003480 | Grad Max: 0.492374
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065133 | Grad Max: 2.748162
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000494 | Grad Max: 0.018710
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031135 | Grad Max: 0.182088
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000640
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006625 | Grad Max: 0.013026
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000321
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001770 | Grad Max: 0.003925
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001632 | Grad Max: 0.003567
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035230 | Grad Max: 0.035230
[GRADIENT NORM TOTAL] 11.3716

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.758
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.510898   0.48910198] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 691/1357 | B: 608/1440 | C: 491/1557
[LOSS Ex1] A: 0.64390 | B: 0.63585 | C: 0.62924
[LOGITS Ex2 A] Mean Abs: 2.048 | Max: 5.952
[LOSS Ex2] A: 0.14526 | B: 0.35819 | C: 0.26562
** [JOINT LOSS] ** : 0.892686
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008090 | Grad Max: 0.226417
  -> Layer: shared_layers.0.bias | Grad Mean: 0.624733 | Grad Max: 2.938836
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002392 | Grad Max: 0.006022
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002200 | Grad Max: 0.002200
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004007 | Grad Max: 0.471668
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074705 | Grad Max: 2.635655
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000592 | Grad Max: 0.021053
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037389 | Grad Max: 0.205060
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000761
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007959 | Grad Max: 0.016428
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000340
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002110 | Grad Max: 0.004714
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001902 | Grad Max: 0.003458
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041087 | Grad Max: 0.041087
[GRADIENT NORM TOTAL] 12.8878

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.782
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032491  0.49675092] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 685/1363 | B: 570/1286 | C: 451/1597
[LOSS Ex1] A: 0.64293 | B: 0.64002 | C: 0.63503
[LOGITS Ex2 A] Mean Abs: 1.996 | Max: 7.236
[LOSS Ex2] A: 0.15412 | B: 0.34918 | C: 0.28143
** [JOINT LOSS] ** : 0.900903
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005681 | Grad Max: 0.187685
  -> Layer: shared_layers.0.bias | Grad Mean: 0.258176 | Grad Max: 1.257170
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.006009
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002066 | Grad Max: 0.002066
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001796 | Grad Max: 0.275371
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032209 | Grad Max: 1.548232
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000261 | Grad Max: 0.010097
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016132 | Grad Max: 0.104013
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000388
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003535 | Grad Max: 0.007640
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000205
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000919 | Grad Max: 0.002156
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000753 | Grad Max: 0.001782
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016556 | Grad Max: 0.016556
[GRADIENT NORM TOTAL] 5.5991

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.647
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50276625 0.4972337 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.062
[MASKS] A(Pass/Fail): 664/1384 | B: 610/1438 | C: 482/1566
[LOSS Ex1] A: 0.65072 | B: 0.63912 | C: 0.63395
[LOGITS Ex2 A] Mean Abs: 1.927 | Max: 6.089
[LOSS Ex2] A: 0.13518 | B: 0.38421 | C: 0.26593
** [JOINT LOSS] ** : 0.903034
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005062 | Grad Max: 0.145214
  -> Layer: shared_layers.0.bias | Grad Mean: 0.479391 | Grad Max: 1.928931
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.005980
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007024 | Grad Max: 0.007024
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003099 | Grad Max: 0.352762
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057929 | Grad Max: 2.005474
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.017869
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030493 | Grad Max: 0.169060
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000655
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006473 | Grad Max: 0.013715
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000282
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001738 | Grad Max: 0.003866
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001632 | Grad Max: 0.003052
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034628 | Grad Max: 0.034628
[GRADIENT NORM TOTAL] 9.6529

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.574
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54187423 0.45812574] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.060
[MASKS] A(Pass/Fail): 664/1384 | B: 607/1441 | C: 300/1076
[LOSS Ex1] A: 0.65065 | B: 0.63990 | C: 0.63353
[LOGITS Ex2 A] Mean Abs: 1.879 | Max: 6.171
[LOSS Ex2] A: 0.15267 | B: 0.39556 | C: 0.27008
** [JOINT LOSS] ** : 0.914130
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008560 | Grad Max: 0.207235
  -> Layer: shared_layers.0.bias | Grad Mean: 0.672711 | Grad Max: 2.779246
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.006481
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010033 | Grad Max: 0.010033
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004256 | Grad Max: 0.451210
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080048 | Grad Max: 2.521109
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000673 | Grad Max: 0.024841
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042448 | Grad Max: 0.231237
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000859
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008941 | Grad Max: 0.017867
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000435
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002384 | Grad Max: 0.005305
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002163 | Grad Max: 0.003849
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046559 | Grad Max: 0.046559
[GRADIENT NORM TOTAL] 13.3719

[EPOCH SUMMARY] Train Loss: 0.8999

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8789 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8797 -> New: 0.8789)

############################## EPOCH 110/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.728
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7230741 0.2769259] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.064
[MASKS] A(Pass/Fail): 713/1335 | B: 608/1440 | C: 451/1597
[LOSS Ex1] A: 0.64519 | B: 0.63573 | C: 0.63711
[LOGITS Ex2 A] Mean Abs: 1.954 | Max: 6.065
[LOSS Ex2] A: 0.13296 | B: 0.35433 | C: 0.27847
** [JOINT LOSS] ** : 0.894597
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003585 | Grad Max: 0.107511
  -> Layer: shared_layers.0.bias | Grad Mean: 0.331384 | Grad Max: 1.428852
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.005792
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000318 | Grad Max: 0.000318
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.519320
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038572 | Grad Max: 2.883473
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000307 | Grad Max: 0.010803
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019454 | Grad Max: 0.098846
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000445
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004140 | Grad Max: 0.009386
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000213
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001107 | Grad Max: 0.002795
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000967 | Grad Max: 0.002200
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021495 | Grad Max: 0.021495
[GRADIENT NORM TOTAL] 7.1900

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.809
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50095963 0.4990404 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 700/1348 | B: 570/1286 | C: 457/1591
[LOSS Ex1] A: 0.65136 | B: 0.63991 | C: 0.63514
[LOGITS Ex2 A] Mean Abs: 2.012 | Max: 5.633
[LOSS Ex2] A: 0.13149 | B: 0.33780 | C: 0.25427
** [JOINT LOSS] ** : 0.883322
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002836 | Grad Max: 0.120675
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289602 | Grad Max: 1.379561
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005457
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001655 | Grad Max: 0.001655
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001929 | Grad Max: 0.264667
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035486 | Grad Max: 1.480762
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.009799
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017407 | Grad Max: 0.098196
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000367
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003541 | Grad Max: 0.007504
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000167
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000961 | Grad Max: 0.002184
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000866 | Grad Max: 0.002330
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019745 | Grad Max: 0.019745
[GRADIENT NORM TOTAL] 6.2958

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.555
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6779082 0.3220918] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 685/1363 | B: 610/1438 | C: 461/1587
[LOSS Ex1] A: 0.64762 | B: 0.63901 | C: 0.63270
[LOGITS Ex2 A] Mean Abs: 2.006 | Max: 6.215
[LOSS Ex2] A: 0.15601 | B: 0.36427 | C: 0.27143
** [JOINT LOSS] ** : 0.903679
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004660 | Grad Max: 0.180078
  -> Layer: shared_layers.0.bias | Grad Mean: 0.475921 | Grad Max: 2.200845
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006251
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001513 | Grad Max: 0.001513
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002953 | Grad Max: 0.341563
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054857 | Grad Max: 1.925379
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000468 | Grad Max: 0.017789
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029750 | Grad Max: 0.169325
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000579
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006233 | Grad Max: 0.012463
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000293
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001676 | Grad Max: 0.003963
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001514 | Grad Max: 0.003112
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033458 | Grad Max: 0.033458
[GRADIENT NORM TOTAL] 9.5248

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.647
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61001176 0.3899882 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.064
[MASKS] A(Pass/Fail): 579/1037 | B: 607/1441 | C: 466/1582
[LOSS Ex1] A: 0.64599 | B: 0.63980 | C: 0.63414
[LOGITS Ex2 A] Mean Abs: 2.010 | Max: 6.268
[LOSS Ex2] A: 0.12863 | B: 0.36448 | C: 0.26701
** [JOINT LOSS] ** : 0.893345
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002554 | Grad Max: 0.078741
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196105 | Grad Max: 0.975178
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.006595
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000388 | Grad Max: 0.000388
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.201521
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024816 | Grad Max: 1.107025
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.006903
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011298 | Grad Max: 0.064166
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000288
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002372 | Grad Max: 0.005601
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000129
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000625 | Grad Max: 0.001467
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.001588
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011166 | Grad Max: 0.011166
[GRADIENT NORM TOTAL] 4.3118

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.811
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071314  0.49286854] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 696/1352 | B: 608/1440 | C: 480/1568
[LOSS Ex1] A: 0.64652 | B: 0.63562 | C: 0.62950
[LOGITS Ex2 A] Mean Abs: 1.974 | Max: 7.122
[LOSS Ex2] A: 0.14178 | B: 0.35594 | C: 0.23169
** [JOINT LOSS] ** : 0.880352
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006597 | Grad Max: 0.179681
  -> Layer: shared_layers.0.bias | Grad Mean: 0.437284 | Grad Max: 1.730302
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.005965
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002244 | Grad Max: 0.002244
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002746 | Grad Max: 0.294776
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051210 | Grad Max: 1.669751
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.014904
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026922 | Grad Max: 0.145342
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000565
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005685 | Grad Max: 0.011527
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000311
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001477 | Grad Max: 0.004157
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001281 | Grad Max: 0.002939
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026915 | Grad Max: 0.026915
[GRADIENT NORM TOTAL] 8.4993

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.761
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51089954 0.48910046] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 691/1357 | B: 570/1286 | C: 459/1589
[LOSS Ex1] A: 0.64373 | B: 0.63981 | C: 0.63341
[LOGITS Ex2 A] Mean Abs: 1.943 | Max: 6.098
[LOSS Ex2] A: 0.13733 | B: 0.36874 | C: 0.25994
** [JOINT LOSS] ** : 0.894320
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004923 | Grad Max: 0.152986
  -> Layer: shared_layers.0.bias | Grad Mean: 0.499393 | Grad Max: 2.193295
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.006473
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007127 | Grad Max: 0.007127
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003031 | Grad Max: 0.337489
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056944 | Grad Max: 1.896135
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000464 | Grad Max: 0.019524
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029505 | Grad Max: 0.189991
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000618
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006177 | Grad Max: 0.012400
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000288
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001642 | Grad Max: 0.003990
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001394 | Grad Max: 0.002740
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030995 | Grad Max: 0.030995
[GRADIENT NORM TOTAL] 9.9740

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.785
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032697  0.49673027] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.063
[MASKS] A(Pass/Fail): 685/1363 | B: 610/1438 | C: 483/1565
[LOSS Ex1] A: 0.64275 | B: 0.63891 | C: 0.63020
[LOGITS Ex2 A] Mean Abs: 1.942 | Max: 6.604
[LOSS Ex2] A: 0.15510 | B: 0.36935 | C: 0.26332
** [JOINT LOSS] ** : 0.899880
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003649 | Grad Max: 0.107519
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239502 | Grad Max: 1.356242
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002308 | Grad Max: 0.006751
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002661 | Grad Max: 0.002661
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001541 | Grad Max: 0.235423
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027037 | Grad Max: 1.313451
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.007971
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011866 | Grad Max: 0.077784
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000302
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002380 | Grad Max: 0.005563
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000640 | Grad Max: 0.001754
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000569 | Grad Max: 0.001493
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012713 | Grad Max: 0.012713
[GRADIENT NORM TOTAL] 4.9835

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.649
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027327  0.49726734] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.549 | Std: 0.062
[MASKS] A(Pass/Fail): 664/1384 | B: 607/1441 | C: 468/1580
[LOSS Ex1] A: 0.65057 | B: 0.63969 | C: 0.63189
[LOGITS Ex2 A] Mean Abs: 1.978 | Max: 6.172
[LOSS Ex2] A: 0.13594 | B: 0.36308 | C: 0.26909
** [JOINT LOSS] ** : 0.896754
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006127 | Grad Max: 0.169527
  -> Layer: shared_layers.0.bias | Grad Mean: 0.450802 | Grad Max: 1.966999
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.006137
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009129 | Grad Max: 0.009129
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002846 | Grad Max: 0.410159
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052476 | Grad Max: 2.290655
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.015529
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025615 | Grad Max: 0.159686
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000516
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005497 | Grad Max: 0.011242
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000258
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001462 | Grad Max: 0.003346
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001210 | Grad Max: 0.002700
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027380 | Grad Max: 0.027380
[GRADIENT NORM TOTAL] 9.2608

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.577
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5419415  0.45805842] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.061
[MASKS] A(Pass/Fail): 664/1384 | B: 609/1439 | C: 447/1601
[LOSS Ex1] A: 0.65050 | B: 0.63551 | C: 0.63623
[LOGITS Ex2 A] Mean Abs: 1.975 | Max: 6.003
[LOSS Ex2] A: 0.14683 | B: 0.35251 | C: 0.29063
** [JOINT LOSS] ** : 0.904067
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007444 | Grad Max: 0.198781
  -> Layer: shared_layers.0.bias | Grad Mean: 0.611085 | Grad Max: 2.506172
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.006310
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008645 | Grad Max: 0.008645
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003930 | Grad Max: 0.495496
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073148 | Grad Max: 2.766162
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000581 | Grad Max: 0.021963
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036868 | Grad Max: 0.208712
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000762
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007879 | Grad Max: 0.016302
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000339
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002105 | Grad Max: 0.004654
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001848 | Grad Max: 0.003701
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040425 | Grad Max: 0.040425
[GRADIENT NORM TOTAL] 12.4431

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.731
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.72392404 0.27607596] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.064
[MASKS] A(Pass/Fail): 713/1335 | B: 570/1286 | C: 464/1584
[LOSS Ex1] A: 0.64504 | B: 0.63970 | C: 0.63287
[LOGITS Ex2 A] Mean Abs: 1.993 | Max: 6.597
[LOSS Ex2] A: 0.13662 | B: 0.34398 | C: 0.27026
** [JOINT LOSS] ** : 0.889486
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005167 | Grad Max: 0.147728
  -> Layer: shared_layers.0.bias | Grad Mean: 0.356101 | Grad Max: 1.352393
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.006139
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000361 | Grad Max: 0.000361
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002344 | Grad Max: 0.324655
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042912 | Grad Max: 1.810015
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.013956
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020768 | Grad Max: 0.136469
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000489
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004543 | Grad Max: 0.009500
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000206
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001236 | Grad Max: 0.002712
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001157 | Grad Max: 0.002649
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024879 | Grad Max: 0.024879
[GRADIENT NORM TOTAL] 7.3188

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.813
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500969   0.49903095] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 700/1348 | B: 610/1438 | C: 465/1583
[LOSS Ex1] A: 0.65121 | B: 0.63880 | C: 0.63186
[LOGITS Ex2 A] Mean Abs: 1.957 | Max: 6.219
[LOSS Ex2] A: 0.12514 | B: 0.37711 | C: 0.25767
** [JOINT LOSS] ** : 0.893930
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004791 | Grad Max: 0.129892
  -> Layer: shared_layers.0.bias | Grad Mean: 0.347710 | Grad Max: 1.445158
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005793
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000200 | Grad Max: 0.000200
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.336066
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039957 | Grad Max: 1.881503
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000316 | Grad Max: 0.010652
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020011 | Grad Max: 0.107299
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000469
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004328 | Grad Max: 0.009224
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000221
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001166 | Grad Max: 0.002809
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001067 | Grad Max: 0.002344
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022938 | Grad Max: 0.022938
[GRADIENT NORM TOTAL] 7.1866

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.557
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6785671  0.32143286] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 685/1363 | B: 607/1441 | C: 462/1586
[LOSS Ex1] A: 0.64747 | B: 0.63959 | C: 0.63480
[LOGITS Ex2 A] Mean Abs: 1.938 | Max: 6.372
[LOSS Ex2] A: 0.14671 | B: 0.37949 | C: 0.26614
** [JOINT LOSS] ** : 0.904731
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007935 | Grad Max: 0.213343
  -> Layer: shared_layers.0.bias | Grad Mean: 0.527946 | Grad Max: 2.221864
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.005488
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003640 | Grad Max: 0.003640
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003317 | Grad Max: 0.470481
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061513 | Grad Max: 2.570152
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000488 | Grad Max: 0.019150
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030877 | Grad Max: 0.180755
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000608
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006674 | Grad Max: 0.013177
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000322
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001803 | Grad Max: 0.004074
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001689 | Grad Max: 0.003007
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035677 | Grad Max: 0.035677
[GRADIENT NORM TOTAL] 10.3061

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.650
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6104063  0.38959375] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.064
[MASKS] A(Pass/Fail): 579/1037 | B: 609/1439 | C: 472/1576
[LOSS Ex1] A: 0.64584 | B: 0.63540 | C: 0.63704
[LOGITS Ex2 A] Mean Abs: 1.985 | Max: 5.608
[LOSS Ex2] A: 0.13653 | B: 0.35328 | C: 0.28652
** [JOINT LOSS] ** : 0.898204
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005352 | Grad Max: 0.171098
  -> Layer: shared_layers.0.bias | Grad Mean: 0.260640 | Grad Max: 0.992219
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.006644
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006080 | Grad Max: 0.006080
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001809 | Grad Max: 0.219955
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032820 | Grad Max: 1.221215
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.009316
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017013 | Grad Max: 0.090861
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000459
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003784 | Grad Max: 0.008426
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000173
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001047 | Grad Max: 0.002271
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001076 | Grad Max: 0.002201
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022025 | Grad Max: 0.022025
[GRADIENT NORM TOTAL] 5.2330

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.814
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071214 0.4928786] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.064
[MASKS] A(Pass/Fail): 696/1352 | B: 570/1286 | C: 308/1068
[LOSS Ex1] A: 0.64638 | B: 0.63959 | C: 0.63251
[LOGITS Ex2 A] Mean Abs: 2.007 | Max: 6.790
[LOSS Ex2] A: 0.14597 | B: 0.34377 | C: 0.26327
** [JOINT LOSS] ** : 0.890498
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003567 | Grad Max: 0.179041
  -> Layer: shared_layers.0.bias | Grad Mean: 0.426118 | Grad Max: 2.207880
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.006141
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006079 | Grad Max: 0.006079
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002648 | Grad Max: 0.353049
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049111 | Grad Max: 1.972742
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.014589
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024243 | Grad Max: 0.149414
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000513
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005097 | Grad Max: 0.010529
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000231
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001392 | Grad Max: 0.002890
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001251 | Grad Max: 0.002905
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028111 | Grad Max: 0.028111
[GRADIENT NORM TOTAL] 9.0322

[EPOCH SUMMARY] Train Loss: 0.8948

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8921 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 111/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.764
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51088405 0.48911592] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 691/1357 | B: 610/1438 | C: 480/1568
[LOSS Ex1] A: 0.64357 | B: 0.63869 | C: 0.63190
[LOGITS Ex2 A] Mean Abs: 2.018 | Max: 5.909
[LOSS Ex2] A: 0.14783 | B: 0.38548 | C: 0.27813
** [JOINT LOSS] ** : 0.908535
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009625 | Grad Max: 0.255507
  -> Layer: shared_layers.0.bias | Grad Mean: 0.721226 | Grad Max: 3.237003
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002286 | Grad Max: 0.006226
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004182 | Grad Max: 0.004182
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004510 | Grad Max: 0.575853
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.084558 | Grad Max: 3.201047
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000716 | Grad Max: 0.026963
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045435 | Grad Max: 0.277940
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000970
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009641 | Grad Max: 0.020741
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000419
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002599 | Grad Max: 0.005697
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002353 | Grad Max: 0.004187
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050945 | Grad Max: 0.050945
[GRADIENT NORM TOTAL] 14.3447

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.788
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032933  0.49670678] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 685/1363 | B: 607/1441 | C: 470/1578
[LOSS Ex1] A: 0.64260 | B: 0.63948 | C: 0.63718
[LOGITS Ex2 A] Mean Abs: 1.994 | Max: 5.948
[LOSS Ex2] A: 0.16199 | B: 0.36398 | C: 0.27408
** [JOINT LOSS] ** : 0.906436
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006729 | Grad Max: 0.204741
  -> Layer: shared_layers.0.bias | Grad Mean: 0.481005 | Grad Max: 2.135135
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006433
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004505 | Grad Max: 0.004505
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003137 | Grad Max: 0.362131
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057665 | Grad Max: 2.034848
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000486 | Grad Max: 0.017587
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030989 | Grad Max: 0.177044
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000651
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006729 | Grad Max: 0.013501
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000316
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001809 | Grad Max: 0.004250
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001537 | Grad Max: 0.003103
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034140 | Grad Max: 0.034140
[GRADIENT NORM TOTAL] 9.7041

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.652
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027103  0.49728975] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.063
[MASKS] A(Pass/Fail): 664/1384 | B: 609/1439 | C: 481/1567
[LOSS Ex1] A: 0.65043 | B: 0.63530 | C: 0.63098
[LOGITS Ex2 A] Mean Abs: 1.926 | Max: 6.338
[LOSS Ex2] A: 0.13853 | B: 0.33685 | C: 0.25418
** [JOINT LOSS] ** : 0.882088
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002845 | Grad Max: 0.080772
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239408 | Grad Max: 1.039432
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.006943
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012597 | Grad Max: 0.012597
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001533 | Grad Max: 0.462363
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028600 | Grad Max: 2.575244
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000220 | Grad Max: 0.008477
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013924 | Grad Max: 0.079916
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000365
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002913 | Grad Max: 0.007095
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000171
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000773 | Grad Max: 0.002119
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000705 | Grad Max: 0.001999
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014804 | Grad Max: 0.014804
[GRADIENT NORM TOTAL] 5.6078

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.579
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5420046  0.45799538] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.061
[MASKS] A(Pass/Fail): 664/1384 | B: 570/1286 | C: 442/1606
[LOSS Ex1] A: 0.65037 | B: 0.63949 | C: 0.63476
[LOGITS Ex2 A] Mean Abs: 1.870 | Max: 6.342
[LOSS Ex2] A: 0.14408 | B: 0.36341 | C: 0.27349
** [JOINT LOSS] ** : 0.901868
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003719 | Grad Max: 0.155521
  -> Layer: shared_layers.0.bias | Grad Mean: 0.456565 | Grad Max: 2.118744
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.006287
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007732 | Grad Max: 0.007732
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002818 | Grad Max: 0.326501
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052901 | Grad Max: 1.802566
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000424 | Grad Max: 0.017430
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027253 | Grad Max: 0.152804
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000513
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005741 | Grad Max: 0.011432
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000271
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001555 | Grad Max: 0.003666
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001275 | Grad Max: 0.002607
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029695 | Grad Max: 0.029695
[GRADIENT NORM TOTAL] 9.3090

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.734
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7246984  0.27530158] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.064
[MASKS] A(Pass/Fail): 714/1334 | B: 611/1437 | C: 476/1572
[LOSS Ex1] A: 0.64489 | B: 0.63859 | C: 0.63645
[LOGITS Ex2 A] Mean Abs: 1.940 | Max: 5.894
[LOSS Ex2] A: 0.13820 | B: 0.37164 | C: 0.26059
** [JOINT LOSS] ** : 0.896787
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003202 | Grad Max: 0.112500
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282837 | Grad Max: 1.451387
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.006303
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002889 | Grad Max: 0.002889
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001740 | Grad Max: 0.261353
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031532 | Grad Max: 1.455544
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.012486
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014570 | Grad Max: 0.102527
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000356
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003009 | Grad Max: 0.006588
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000141
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000824 | Grad Max: 0.001912
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000712 | Grad Max: 0.001765
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016579 | Grad Max: 0.016579
[GRADIENT NORM TOTAL] 5.8619

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.816
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50094986 0.49905017] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 700/1348 | B: 607/1441 | C: 429/1619
[LOSS Ex1] A: 0.65108 | B: 0.63938 | C: 0.63489
[LOGITS Ex2 A] Mean Abs: 1.971 | Max: 5.999
[LOSS Ex2] A: 0.13091 | B: 0.36461 | C: 0.25351
** [JOINT LOSS] ** : 0.891464
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003958 | Grad Max: 0.135044
  -> Layer: shared_layers.0.bias | Grad Mean: 0.339356 | Grad Max: 1.668382
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.005665
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005666 | Grad Max: 0.005666
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.331919
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040105 | Grad Max: 1.845672
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.011279
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019853 | Grad Max: 0.112071
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000404
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004218 | Grad Max: 0.008643
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001152 | Grad Max: 0.002587
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001019 | Grad Max: 0.002718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022894 | Grad Max: 0.022894
[GRADIENT NORM TOTAL] 7.3015

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.559
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6790944 0.3209057] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 685/1363 | B: 609/1439 | C: 477/1571
[LOSS Ex1] A: 0.64733 | B: 0.63520 | C: 0.62957
[LOGITS Ex2 A] Mean Abs: 1.984 | Max: 5.633
[LOSS Ex2] A: 0.15164 | B: 0.34770 | C: 0.28125
** [JOINT LOSS] ** : 0.897560
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004880 | Grad Max: 0.196758
  -> Layer: shared_layers.0.bias | Grad Mean: 0.467317 | Grad Max: 2.406163
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.005847
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000131 | Grad Max: 0.000131
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002904 | Grad Max: 0.483871
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054227 | Grad Max: 2.705593
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000420 | Grad Max: 0.015271
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026866 | Grad Max: 0.153475
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000606
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005655 | Grad Max: 0.012051
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000291
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001531 | Grad Max: 0.003694
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001423 | Grad Max: 0.002977
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030999 | Grad Max: 0.030999
[GRADIENT NORM TOTAL] 9.9943

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.653
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6106794  0.38932064] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.064
[MASKS] A(Pass/Fail): 580/1036 | B: 570/1286 | C: 443/1605
[LOSS Ex1] A: 0.64569 | B: 0.63939 | C: 0.63524
[LOGITS Ex2 A] Mean Abs: 1.995 | Max: 5.806
[LOSS Ex2] A: 0.13203 | B: 0.33731 | C: 0.26325
** [JOINT LOSS] ** : 0.884306
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001733 | Grad Max: 0.036179
  -> Layer: shared_layers.0.bias | Grad Mean: 0.096690 | Grad Max: 0.375981
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006398
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009293 | Grad Max: 0.009293
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000691 | Grad Max: 0.153999
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012100 | Grad Max: 0.859100
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.003797
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003907 | Grad Max: 0.029519
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000207
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000825 | Grad Max: 0.003284
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000225 | Grad Max: 0.000770
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000491 | Grad Max: 0.001268
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004058 | Grad Max: 0.004058
[GRADIENT NORM TOTAL] 2.4958

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.816
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071514  0.49284855] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 696/1352 | B: 612/1436 | C: 451/1597
[LOSS Ex1] A: 0.64623 | B: 0.63849 | C: 0.63573
[LOGITS Ex2 A] Mean Abs: 1.953 | Max: 7.965
[LOSS Ex2] A: 0.13166 | B: 0.37455 | C: 0.29081
** [JOINT LOSS] ** : 0.905823
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004943 | Grad Max: 0.133292
  -> Layer: shared_layers.0.bias | Grad Mean: 0.380450 | Grad Max: 1.735773
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.005838
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000027 | Grad Max: 0.000027
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002453 | Grad Max: 0.388551
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045555 | Grad Max: 2.201898
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.014613
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023670 | Grad Max: 0.151248
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000484
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005075 | Grad Max: 0.010361
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000266
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001400 | Grad Max: 0.003334
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001317 | Grad Max: 0.002517
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028701 | Grad Max: 0.028701
[GRADIENT NORM TOTAL] 8.0235

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.767
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108718  0.48912823] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 691/1357 | B: 607/1441 | C: 476/1572
[LOSS Ex1] A: 0.64341 | B: 0.63927 | C: 0.63174
[LOGITS Ex2 A] Mean Abs: 1.959 | Max: 5.904
[LOSS Ex2] A: 0.13682 | B: 0.37096 | C: 0.25022
** [JOINT LOSS] ** : 0.890808
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004694 | Grad Max: 0.128897
  -> Layer: shared_layers.0.bias | Grad Mean: 0.378737 | Grad Max: 1.735713
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002245 | Grad Max: 0.006334
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001403 | Grad Max: 0.001403
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002419 | Grad Max: 0.324462
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045003 | Grad Max: 1.837222
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000373 | Grad Max: 0.012402
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023723 | Grad Max: 0.111841
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000516
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005092 | Grad Max: 0.010444
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001376 | Grad Max: 0.003200
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001203 | Grad Max: 0.002495
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026488 | Grad Max: 0.026488
[GRADIENT NORM TOTAL] 7.6823

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.791
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503335   0.49666503] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 686/1362 | B: 610/1438 | C: 487/1561
[LOSS Ex1] A: 0.64243 | B: 0.63508 | C: 0.62983
[LOGITS Ex2 A] Mean Abs: 1.948 | Max: 8.153
[LOSS Ex2] A: 0.14764 | B: 0.33922 | C: 0.25462
** [JOINT LOSS] ** : 0.882939
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002557 | Grad Max: 0.111608
  -> Layer: shared_layers.0.bias | Grad Mean: 0.068875 | Grad Max: 0.334585
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002364 | Grad Max: 0.006516
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003383 | Grad Max: 0.003383
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000727 | Grad Max: 0.141815
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012092 | Grad Max: 0.790715
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.004432
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005039 | Grad Max: 0.026763
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000167
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001151 | Grad Max: 0.003682
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000129
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000318 | Grad Max: 0.001076
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000427 | Grad Max: 0.001312
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006548 | Grad Max: 0.006548
[GRADIENT NORM TOTAL] 1.9522

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.655
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50270396 0.49729607] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.063
[MASKS] A(Pass/Fail): 665/1383 | B: 571/1285 | C: 455/1593
[LOSS Ex1] A: 0.65028 | B: 0.63926 | C: 0.63460
[LOGITS Ex2 A] Mean Abs: 1.938 | Max: 6.523
[LOSS Ex2] A: 0.13160 | B: 0.33711 | C: 0.26267
** [JOINT LOSS] ** : 0.885174
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001862 | Grad Max: 0.044508
  -> Layer: shared_layers.0.bias | Grad Mean: 0.051439 | Grad Max: 0.378444
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005721
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000072 | Grad Max: 0.000072
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000473 | Grad Max: 0.113891
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.007753 | Grad Max: 0.615574
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.003144
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001652 | Grad Max: 0.016238
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000137
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000313 | Grad Max: 0.002285
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000102 | Grad Max: 0.000503
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000530 | Grad Max: 0.001092
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000108 | Grad Max: 0.000108
[GRADIENT NORM TOTAL] 1.6760

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.582
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54206556 0.45793444] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.061
[MASKS] A(Pass/Fail): 664/1384 | B: 613/1435 | C: 473/1575
[LOSS Ex1] A: 0.65021 | B: 0.63835 | C: 0.62861
[LOGITS Ex2 A] Mean Abs: 1.929 | Max: 5.617
[LOSS Ex2] A: 0.14302 | B: 0.36294 | C: 0.26471
** [JOINT LOSS] ** : 0.895944
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002303 | Grad Max: 0.072302
  -> Layer: shared_layers.0.bias | Grad Mean: 0.054322 | Grad Max: 0.236491
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006883
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010335 | Grad Max: 0.010335
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000572 | Grad Max: 0.152602
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009326 | Grad Max: 0.853370
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002580
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001896 | Grad Max: 0.014788
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000113
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000343 | Grad Max: 0.002129
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000097 | Grad Max: 0.000511
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000316 | Grad Max: 0.000927
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001525 | Grad Max: 0.001525
[GRADIENT NORM TOTAL] 1.8476

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.737
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7256309 0.2743691] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.064
[MASKS] A(Pass/Fail): 716/1332 | B: 607/1441 | C: 326/1050
[LOSS Ex1] A: 0.64470 | B: 0.63911 | C: 0.63271
[LOGITS Ex2 A] Mean Abs: 1.976 | Max: 7.155
[LOSS Ex2] A: 0.13035 | B: 0.36530 | C: 0.26534
** [JOINT LOSS] ** : 0.892504
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003200 | Grad Max: 0.087522
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156629 | Grad Max: 0.570005
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.006494
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009305 | Grad Max: 0.009305
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001173 | Grad Max: 0.181303
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020952 | Grad Max: 1.023808
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000155 | Grad Max: 0.008588
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009614 | Grad Max: 0.067635
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000263
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002088 | Grad Max: 0.005269
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000563 | Grad Max: 0.001377
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000484 | Grad Max: 0.001674
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010785 | Grad Max: 0.010785
[GRADIENT NORM TOTAL] 3.5957

[EPOCH SUMMARY] Train Loss: 0.8944

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8754 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8789 -> New: 0.8754)

############################## EPOCH 112/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.819
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009151  0.49908492] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 700/1348 | B: 611/1437 | C: 464/1584
[LOSS Ex1] A: 0.65089 | B: 0.63489 | C: 0.63274
[LOGITS Ex2 A] Mean Abs: 1.994 | Max: 5.873
[LOSS Ex2] A: 0.13056 | B: 0.34979 | C: 0.25660
** [JOINT LOSS] ** : 0.885155
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002676 | Grad Max: 0.066260
  -> Layer: shared_layers.0.bias | Grad Mean: 0.190054 | Grad Max: 0.937510
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002220 | Grad Max: 0.005748
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003692 | Grad Max: 0.003692
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001199 | Grad Max: 0.279875
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022140 | Grad Max: 1.575784
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.008840
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010490 | Grad Max: 0.078916
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000269
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002207 | Grad Max: 0.005449
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000129
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000588 | Grad Max: 0.001473
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.001714
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011437 | Grad Max: 0.011437
[GRADIENT NORM TOTAL] 4.4394

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.563
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.67990094 0.32009903] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 686/1362 | B: 571/1285 | C: 454/1594
[LOSS Ex1] A: 0.64711 | B: 0.63907 | C: 0.63883
[LOGITS Ex2 A] Mean Abs: 1.983 | Max: 6.300
[LOSS Ex2] A: 0.14796 | B: 0.34606 | C: 0.28613
** [JOINT LOSS] ** : 0.901714
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003450 | Grad Max: 0.071976
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207580 | Grad Max: 1.068628
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005720
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001367 | Grad Max: 0.001367
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001394 | Grad Max: 0.253972
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025775 | Grad Max: 1.437107
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000216 | Grad Max: 0.007683
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013714 | Grad Max: 0.074424
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000324
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002851 | Grad Max: 0.006456
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000149
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000754 | Grad Max: 0.001831
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000649 | Grad Max: 0.001717
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013711 | Grad Max: 0.013711
[GRADIENT NORM TOTAL] 4.3821

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.657
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61107475 0.38892522] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.064
[MASKS] A(Pass/Fail): 580/1036 | B: 613/1435 | C: 466/1582
[LOSS Ex1] A: 0.64545 | B: 0.63815 | C: 0.63108
[LOGITS Ex2 A] Mean Abs: 2.038 | Max: 6.307
[LOSS Ex2] A: 0.13850 | B: 0.36246 | C: 0.25428
** [JOINT LOSS] ** : 0.889976
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003864 | Grad Max: 0.106781
  -> Layer: shared_layers.0.bias | Grad Mean: 0.295825 | Grad Max: 1.163207
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006167
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004428 | Grad Max: 0.004428
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.267031
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036155 | Grad Max: 1.493388
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.011443
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018076 | Grad Max: 0.096691
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000433
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003917 | Grad Max: 0.008194
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000217
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001052 | Grad Max: 0.002664
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000937 | Grad Max: 0.002391
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020422 | Grad Max: 0.020422
[GRADIENT NORM TOTAL] 6.1395

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.821
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072226 0.4927774] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 696/1352 | B: 608/1440 | C: 485/1563
[LOSS Ex1] A: 0.64598 | B: 0.63892 | C: 0.63083
[LOGITS Ex2 A] Mean Abs: 2.035 | Max: 5.923
[LOSS Ex2] A: 0.13027 | B: 0.36549 | C: 0.27201
** [JOINT LOSS] ** : 0.894502
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005251 | Grad Max: 0.131854
  -> Layer: shared_layers.0.bias | Grad Mean: 0.353632 | Grad Max: 1.641911
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.005574
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000996 | Grad Max: 0.000997
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.322319
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043547 | Grad Max: 1.787200
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000341 | Grad Max: 0.011886
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021614 | Grad Max: 0.130834
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000434
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004652 | Grad Max: 0.009499
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000229
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001244 | Grad Max: 0.003071
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001080 | Grad Max: 0.002379
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023258 | Grad Max: 0.023258
[GRADIENT NORM TOTAL] 7.5684

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.773
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51079583 0.48920414] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 691/1357 | B: 611/1437 | C: 491/1557
[LOSS Ex1] A: 0.64315 | B: 0.63470 | C: 0.63317
[LOGITS Ex2 A] Mean Abs: 2.001 | Max: 5.650
[LOSS Ex2] A: 0.13753 | B: 0.34404 | C: 0.26349
** [JOINT LOSS] ** : 0.885359
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001715 | Grad Max: 0.032374
  -> Layer: shared_layers.0.bias | Grad Mean: 0.065595 | Grad Max: 0.247916
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.006705
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005480 | Grad Max: 0.005480
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000630 | Grad Max: 0.153016
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010936 | Grad Max: 0.863790
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.004048
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003050 | Grad Max: 0.031550
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000189
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000620 | Grad Max: 0.002660
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000058
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.000656
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000333 | Grad Max: 0.001035
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003475 | Grad Max: 0.003475
[GRADIENT NORM TOTAL] 1.9719

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.797
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034577  0.49654227] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 688/1360 | B: 571/1285 | C: 482/1566
[LOSS Ex1] A: 0.64215 | B: 0.63888 | C: 0.63165
[LOGITS Ex2 A] Mean Abs: 1.997 | Max: 7.277
[LOSS Ex2] A: 0.15519 | B: 0.34744 | C: 0.24236
** [JOINT LOSS] ** : 0.885891
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003234 | Grad Max: 0.105935
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203712 | Grad Max: 0.975250
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006200
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001496 | Grad Max: 0.001496
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001321 | Grad Max: 0.144062
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024017 | Grad Max: 0.742641
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.009315
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011967 | Grad Max: 0.078282
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000303
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002383 | Grad Max: 0.005879
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000140
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000627 | Grad Max: 0.001618
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001441
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011711 | Grad Max: 0.011711
[GRADIENT NORM TOTAL] 4.0917

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.660
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50271994 0.49728003] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.063
[MASKS] A(Pass/Fail): 667/1381 | B: 614/1434 | C: 456/1592
[LOSS Ex1] A: 0.65001 | B: 0.63795 | C: 0.63203
[LOGITS Ex2 A] Mean Abs: 1.980 | Max: 6.936
[LOSS Ex2] A: 0.13571 | B: 0.36814 | C: 0.27387
** [JOINT LOSS] ** : 0.899238
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.053875
  -> Layer: shared_layers.0.bias | Grad Mean: 0.114385 | Grad Max: 0.594906
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005993
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006627 | Grad Max: 0.006627
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000738 | Grad Max: 0.407275
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012807 | Grad Max: 2.284618
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003758
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002323 | Grad Max: 0.021632
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000166
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000402 | Grad Max: 0.002437
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000064
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000096 | Grad Max: 0.000557
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000239 | Grad Max: 0.000725
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000378 | Grad Max: 0.000378
[GRADIENT NORM TOTAL] 3.7333

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.587
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5421711  0.45782894] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.549 | Std: 0.061
[MASKS] A(Pass/Fail): 664/1384 | B: 608/1440 | C: 435/1613
[LOSS Ex1] A: 0.64995 | B: 0.63871 | C: 0.63554
[LOGITS Ex2 A] Mean Abs: 1.990 | Max: 6.168
[LOSS Ex2] A: 0.14144 | B: 0.36418 | C: 0.25021
** [JOINT LOSS] ** : 0.893343
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002472 | Grad Max: 0.089223
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239104 | Grad Max: 1.088072
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.007028
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013601 | Grad Max: 0.013601
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001575 | Grad Max: 0.248893
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029019 | Grad Max: 1.374720
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.009452
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013204 | Grad Max: 0.093084
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000283
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002696 | Grad Max: 0.006036
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000152
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000709 | Grad Max: 0.001711
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000609 | Grad Max: 0.001701
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013418 | Grad Max: 0.013418
[GRADIENT NORM TOTAL] 5.2861

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.741
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7270694  0.27293068] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.065
[MASKS] A(Pass/Fail): 716/1332 | B: 611/1437 | C: 457/1591
[LOSS Ex1] A: 0.64443 | B: 0.63448 | C: 0.62980
[LOGITS Ex2 A] Mean Abs: 2.023 | Max: 6.096
[LOSS Ex2] A: 0.13692 | B: 0.34307 | C: 0.25782
** [JOINT LOSS] ** : 0.882177
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.078947
  -> Layer: shared_layers.0.bias | Grad Mean: 0.133900 | Grad Max: 0.780460
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002344 | Grad Max: 0.006276
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006137 | Grad Max: 0.006137
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000929 | Grad Max: 0.201871
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016759 | Grad Max: 1.122662
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.005642
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007131 | Grad Max: 0.047612
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000225
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001475 | Grad Max: 0.003994
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000103
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000396 | Grad Max: 0.001101
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000431 | Grad Max: 0.001378
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008272 | Grad Max: 0.008272
[GRADIENT NORM TOTAL] 3.3879

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.824
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009236  0.49907643] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 701/1347 | B: 571/1285 | C: 480/1568
[LOSS Ex1] A: 0.65064 | B: 0.63866 | C: 0.63252
[LOGITS Ex2 A] Mean Abs: 2.013 | Max: 6.156
[LOSS Ex2] A: 0.13074 | B: 0.36159 | C: 0.27079
** [JOINT LOSS] ** : 0.894979
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005684 | Grad Max: 0.177024
  -> Layer: shared_layers.0.bias | Grad Mean: 0.388773 | Grad Max: 1.814867
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.005789
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001777 | Grad Max: 0.001777
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002536 | Grad Max: 0.329032
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046437 | Grad Max: 1.835523
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000378 | Grad Max: 0.010614
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024109 | Grad Max: 0.103904
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000541
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005156 | Grad Max: 0.011330
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000277
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001371 | Grad Max: 0.003118
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001224 | Grad Max: 0.002398
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026363 | Grad Max: 0.026363
[GRADIENT NORM TOTAL] 7.9500

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.567
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6809912 0.3190088] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.065
[MASKS] A(Pass/Fail): 688/1360 | B: 615/1433 | C: 486/1562
[LOSS Ex1] A: 0.64685 | B: 0.63775 | C: 0.62877
[LOGITS Ex2 A] Mean Abs: 1.996 | Max: 6.321
[LOSS Ex2] A: 0.13863 | B: 0.37883 | C: 0.26521
** [JOINT LOSS] ** : 0.898682
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006835 | Grad Max: 0.165496
  -> Layer: shared_layers.0.bias | Grad Mean: 0.480119 | Grad Max: 2.233066
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006170
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003412 | Grad Max: 0.003412
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003239 | Grad Max: 0.420965
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060280 | Grad Max: 2.383028
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.017146
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030583 | Grad Max: 0.169512
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000618
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006539 | Grad Max: 0.013409
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000324
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001742 | Grad Max: 0.003966
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001636 | Grad Max: 0.003128
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033659 | Grad Max: 0.033659
[GRADIENT NORM TOTAL] 10.0143

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.662
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6117492 0.3882508] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 580/1036 | B: 608/1440 | C: 467/1581
[LOSS Ex1] A: 0.64519 | B: 0.63852 | C: 0.63333
[LOGITS Ex2 A] Mean Abs: 2.072 | Max: 5.854
[LOSS Ex2] A: 0.13299 | B: 0.36550 | C: 0.24724
** [JOINT LOSS] ** : 0.887586
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001739 | Grad Max: 0.038771
  -> Layer: shared_layers.0.bias | Grad Mean: 0.067468 | Grad Max: 0.246020
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006164
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006282 | Grad Max: 0.006282
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000601 | Grad Max: 0.176060
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010358 | Grad Max: 0.983622
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.003067
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002130 | Grad Max: 0.026329
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000129
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000412 | Grad Max: 0.002555
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000123 | Grad Max: 0.000751
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000314 | Grad Max: 0.001325
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003497 | Grad Max: 0.003497
[GRADIENT NORM TOTAL] 2.3230

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.826
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072111  0.49278888] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 696/1352 | B: 611/1437 | C: 438/1610
[LOSS Ex1] A: 0.64573 | B: 0.63428 | C: 0.63619
[LOGITS Ex2 A] Mean Abs: 2.078 | Max: 6.655
[LOSS Ex2] A: 0.13376 | B: 0.35517 | C: 0.28008
** [JOINT LOSS] ** : 0.895069
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006780 | Grad Max: 0.190956
  -> Layer: shared_layers.0.bias | Grad Mean: 0.515983 | Grad Max: 2.441994
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.006105
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003222 | Grad Max: 0.003222
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003459 | Grad Max: 0.422291
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064108 | Grad Max: 2.342909
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000498 | Grad Max: 0.017506
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031910 | Grad Max: 0.166072
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000703
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006796 | Grad Max: 0.014496
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000342
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001797 | Grad Max: 0.004556
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001566 | Grad Max: 0.003043
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033286 | Grad Max: 0.033286
[GRADIENT NORM TOTAL] 10.8763

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.778
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51085913 0.4891409 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 692/1356 | B: 572/1284 | C: 319/1057
[LOSS Ex1] A: 0.64288 | B: 0.63848 | C: 0.62795
[LOGITS Ex2 A] Mean Abs: 2.076 | Max: 5.757
[LOSS Ex2] A: 0.14505 | B: 0.34675 | C: 0.28352
** [JOINT LOSS] ** : 0.894876
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008183 | Grad Max: 0.198739
  -> Layer: shared_layers.0.bias | Grad Mean: 0.535444 | Grad Max: 2.446313
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.006253
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002604 | Grad Max: 0.002604
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003584 | Grad Max: 0.428690
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066546 | Grad Max: 2.426659
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000521 | Grad Max: 0.019291
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033356 | Grad Max: 0.187273
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000714
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007109 | Grad Max: 0.014969
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000340
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001890 | Grad Max: 0.004356
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001717 | Grad Max: 0.003316
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036102 | Grad Max: 0.036102
[GRADIENT NORM TOTAL] 11.0627

[EPOCH SUMMARY] Train Loss: 0.8920

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8710 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8754 -> New: 0.8710)

############################## EPOCH 113/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.803
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50348747 0.4965125 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.065
[MASKS] A(Pass/Fail): 690/1358 | B: 615/1433 | C: 502/1546
[LOSS Ex1] A: 0.64189 | B: 0.63757 | C: 0.63090
[LOGITS Ex2 A] Mean Abs: 2.027 | Max: 7.531
[LOSS Ex2] A: 0.15358 | B: 0.36731 | C: 0.25535
** [JOINT LOSS] ** : 0.895535
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004572 | Grad Max: 0.155407
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100301 | Grad Max: 0.396994
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002267 | Grad Max: 0.006556
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000647 | Grad Max: 0.000647
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000913 | Grad Max: 0.189057
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015164 | Grad Max: 0.989198
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.004667
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006382 | Grad Max: 0.033952
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000263
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001505 | Grad Max: 0.004309
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000129
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000394 | Grad Max: 0.001401
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000383 | Grad Max: 0.001245
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007181 | Grad Max: 0.007181
[GRADIENT NORM TOTAL] 2.5892

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.664
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026603 0.4973397] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.064
[MASKS] A(Pass/Fail): 667/1381 | B: 608/1440 | C: 482/1566
[LOSS Ex1] A: 0.64978 | B: 0.63834 | C: 0.62979
[LOGITS Ex2 A] Mean Abs: 1.940 | Max: 6.582
[LOSS Ex2] A: 0.13784 | B: 0.38869 | C: 0.25689
** [JOINT LOSS] ** : 0.900442
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008436 | Grad Max: 0.227072
  -> Layer: shared_layers.0.bias | Grad Mean: 0.696499 | Grad Max: 3.126529
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005735
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004758 | Grad Max: 0.004758
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004410 | Grad Max: 0.679991
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.082163 | Grad Max: 3.798304
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000630 | Grad Max: 0.021641
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040619 | Grad Max: 0.221516
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000777
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008662 | Grad Max: 0.017507
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000423
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002310 | Grad Max: 0.005449
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002088 | Grad Max: 0.003639
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044401 | Grad Max: 0.044401
[GRADIENT NORM TOTAL] 14.5566

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.591
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5423879 0.4576121] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 665/1383 | B: 611/1437 | C: 469/1579
[LOSS Ex1] A: 0.64973 | B: 0.63410 | C: 0.63339
[LOGITS Ex2 A] Mean Abs: 1.913 | Max: 5.914
[LOSS Ex2] A: 0.15774 | B: 0.38131 | C: 0.27995
** [JOINT LOSS] ** : 0.912072
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014229 | Grad Max: 0.398510
  -> Layer: shared_layers.0.bias | Grad Mean: 0.880956 | Grad Max: 3.547540
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006079
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008153 | Grad Max: 0.008153
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005706 | Grad Max: 0.734843
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.105726 | Grad Max: 4.081692
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000841 | Grad Max: 0.027193
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053681 | Grad Max: 0.272178
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000108 | Grad Max: 0.001112
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011534 | Grad Max: 0.022775
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000552
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003066 | Grad Max: 0.007079
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002812 | Grad Max: 0.005303
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058902 | Grad Max: 0.058902
[GRADIENT NORM TOTAL] 17.7423

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.746
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.72842014 0.27157986] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.065
[MASKS] A(Pass/Fail): 719/1329 | B: 572/1284 | C: 468/1580
[LOSS Ex1] A: 0.64418 | B: 0.63830 | C: 0.63218
[LOGITS Ex2 A] Mean Abs: 1.977 | Max: 6.580
[LOSS Ex2] A: 0.13770 | B: 0.36371 | C: 0.25562
** [JOINT LOSS] ** : 0.890565
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010567 | Grad Max: 0.324502
  -> Layer: shared_layers.0.bias | Grad Mean: 0.632645 | Grad Max: 2.534528
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002287 | Grad Max: 0.006362
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007824 | Grad Max: 0.007824
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004068 | Grad Max: 0.563412
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074682 | Grad Max: 3.142462
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000596 | Grad Max: 0.018198
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037884 | Grad Max: 0.187550
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000814
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008213 | Grad Max: 0.016507
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000417
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002164 | Grad Max: 0.005097
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001916 | Grad Max: 0.003397
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040542 | Grad Max: 0.040542
[GRADIENT NORM TOTAL] 12.6861

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.829
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50093406 0.49906594] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 702/1346 | B: 615/1433 | C: 442/1606
[LOSS Ex1] A: 0.65043 | B: 0.63741 | C: 0.63290
[LOGITS Ex2 A] Mean Abs: 2.022 | Max: 5.851
[LOSS Ex2] A: 0.13832 | B: 0.36000 | C: 0.26355
** [JOINT LOSS] ** : 0.894203
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005077 | Grad Max: 0.214376
  -> Layer: shared_layers.0.bias | Grad Mean: 0.085152 | Grad Max: 0.301566
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.005797
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001161 | Grad Max: 0.001161
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000878 | Grad Max: 0.154376
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012946 | Grad Max: 0.879697
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003350
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002527 | Grad Max: 0.023689
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000210
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000422 | Grad Max: 0.002151
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000090
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000115 | Grad Max: 0.000829
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000271 | Grad Max: 0.001060
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002321 | Grad Max: 0.002321
[GRADIENT NORM TOTAL] 2.4129

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.570
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.68188864 0.3181113 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.065
[MASKS] A(Pass/Fail): 690/1358 | B: 608/1440 | C: 453/1595
[LOSS Ex1] A: 0.64662 | B: 0.63817 | C: 0.63642
[LOGITS Ex2 A] Mean Abs: 2.037 | Max: 5.938
[LOSS Ex2] A: 0.15485 | B: 0.37746 | C: 0.29488
** [JOINT LOSS] ** : 0.916136
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005568 | Grad Max: 0.237527
  -> Layer: shared_layers.0.bias | Grad Mean: 0.622633 | Grad Max: 3.022476
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005626
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006132 | Grad Max: 0.006132
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003972 | Grad Max: 0.598495
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074318 | Grad Max: 3.335666
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000569 | Grad Max: 0.021732
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037139 | Grad Max: 0.199512
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000786
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007727 | Grad Max: 0.015735
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000380
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002031 | Grad Max: 0.004809
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001711 | Grad Max: 0.003129
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037697 | Grad Max: 0.037697
[GRADIENT NORM TOTAL] 13.4832

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.667
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61221707 0.3877829 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.065
[MASKS] A(Pass/Fail): 581/1035 | B: 611/1437 | C: 447/1601
[LOSS Ex1] A: 0.64496 | B: 0.63394 | C: 0.63197
[LOGITS Ex2 A] Mean Abs: 2.083 | Max: 6.593
[LOSS Ex2] A: 0.14701 | B: 0.35836 | C: 0.27406
** [JOINT LOSS] ** : 0.896768
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009897 | Grad Max: 0.299263
  -> Layer: shared_layers.0.bias | Grad Mean: 0.758540 | Grad Max: 3.683718
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.006310
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001684 | Grad Max: 0.001684
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004872 | Grad Max: 0.605861
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.091067 | Grad Max: 3.386441
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000714 | Grad Max: 0.026437
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.046253 | Grad Max: 0.260030
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000911
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009802 | Grad Max: 0.018922
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000455
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002615 | Grad Max: 0.005867
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002276 | Grad Max: 0.004143
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049546 | Grad Max: 0.049546
[GRADIENT NORM TOTAL] 15.5461

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.831
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072426  0.49275738] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 696/1352 | B: 572/1284 | C: 497/1551
[LOSS Ex1] A: 0.64550 | B: 0.63815 | C: 0.63035
[LOGITS Ex2 A] Mean Abs: 2.055 | Max: 7.321
[LOSS Ex2] A: 0.14457 | B: 0.33981 | C: 0.26385
** [JOINT LOSS] ** : 0.887411
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009890 | Grad Max: 0.321778
  -> Layer: shared_layers.0.bias | Grad Mean: 0.439954 | Grad Max: 1.897734
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.006134
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005079 | Grad Max: 0.005079
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003134 | Grad Max: 0.433963
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056906 | Grad Max: 2.419709
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.014123
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027955 | Grad Max: 0.140769
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000615
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006173 | Grad Max: 0.012685
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000289
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001664 | Grad Max: 0.003701
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001524 | Grad Max: 0.003333
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032415 | Grad Max: 0.032415
[GRADIENT NORM TOTAL] 9.4027

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.783
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108184 0.4891816] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 692/1356 | B: 615/1433 | C: 450/1598
[LOSS Ex1] A: 0.64265 | B: 0.63726 | C: 0.63252
[LOGITS Ex2 A] Mean Abs: 1.979 | Max: 5.955
[LOSS Ex2] A: 0.14553 | B: 0.37861 | C: 0.27177
** [JOINT LOSS] ** : 0.902781
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004789 | Grad Max: 0.145140
  -> Layer: shared_layers.0.bias | Grad Mean: 0.358868 | Grad Max: 1.780424
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006322
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001109 | Grad Max: 0.001109
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002299 | Grad Max: 0.276873
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041123 | Grad Max: 1.547852
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000327 | Grad Max: 0.013432
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021051 | Grad Max: 0.134929
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000427
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004195 | Grad Max: 0.008795
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001109 | Grad Max: 0.002538
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000868 | Grad Max: 0.001871
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020812 | Grad Max: 0.020812
[GRADIENT NORM TOTAL] 7.3293

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.807
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50357676 0.4964232 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 690/1358 | B: 608/1440 | C: 491/1557
[LOSS Ex1] A: 0.64166 | B: 0.63803 | C: 0.62646
[LOGITS Ex2 A] Mean Abs: 1.939 | Max: 6.349
[LOSS Ex2] A: 0.14867 | B: 0.38413 | C: 0.26003
** [JOINT LOSS] ** : 0.899663
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006551 | Grad Max: 0.214993
  -> Layer: shared_layers.0.bias | Grad Mean: 0.570267 | Grad Max: 2.828212
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.006755
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004908 | Grad Max: 0.004908
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003621 | Grad Max: 0.425587
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065756 | Grad Max: 2.309348
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000538 | Grad Max: 0.020422
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034806 | Grad Max: 0.199280
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000750
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006997 | Grad Max: 0.014169
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000318
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001858 | Grad Max: 0.004365
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001531 | Grad Max: 0.002845
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035202 | Grad Max: 0.035202
[GRADIENT NORM TOTAL] 11.6467

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.668
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026857 0.4973143] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.550 | Std: 0.064
[MASKS] A(Pass/Fail): 667/1381 | B: 611/1437 | C: 470/1578
[LOSS Ex1] A: 0.64958 | B: 0.63380 | C: 0.63192
[LOGITS Ex2 A] Mean Abs: 1.919 | Max: 6.080
[LOSS Ex2] A: 0.13885 | B: 0.34897 | C: 0.23909
** [JOINT LOSS] ** : 0.880740
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003177 | Grad Max: 0.103940
  -> Layer: shared_layers.0.bias | Grad Mean: 0.302691 | Grad Max: 1.559144
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.005991
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004328 | Grad Max: 0.004328
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.346079
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036305 | Grad Max: 1.909710
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000295 | Grad Max: 0.013076
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019025 | Grad Max: 0.116565
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000421
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003837 | Grad Max: 0.009086
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000205
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000986 | Grad Max: 0.002707
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000788 | Grad Max: 0.002286
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017306 | Grad Max: 0.017306
[GRADIENT NORM TOTAL] 6.6725

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.594
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54235786 0.45764217] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 665/1383 | B: 572/1284 | C: 453/1595
[LOSS Ex1] A: 0.64954 | B: 0.63802 | C: 0.63210
[LOGITS Ex2 A] Mean Abs: 1.949 | Max: 6.617
[LOSS Ex2] A: 0.14516 | B: 0.34171 | C: 0.26983
** [JOINT LOSS] ** : 0.892118
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007279 | Grad Max: 0.246013
  -> Layer: shared_layers.0.bias | Grad Mean: 0.271856 | Grad Max: 0.807233
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005665
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003825 | Grad Max: 0.003825
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001922 | Grad Max: 0.239517
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033549 | Grad Max: 1.265776
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000260 | Grad Max: 0.008187
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016345 | Grad Max: 0.076572
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000439
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003753 | Grad Max: 0.008153
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000175
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001028 | Grad Max: 0.002199
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000966 | Grad Max: 0.002351
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020530 | Grad Max: 0.020530
[GRADIENT NORM TOTAL] 5.4698

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.750
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7293593 0.2706407] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.065
[MASKS] A(Pass/Fail): 719/1329 | B: 615/1433 | C: 463/1585
[LOSS Ex1] A: 0.64399 | B: 0.63712 | C: 0.63292
[LOGITS Ex2 A] Mean Abs: 1.998 | Max: 6.421
[LOSS Ex2] A: 0.13632 | B: 0.36559 | C: 0.27684
** [JOINT LOSS] ** : 0.897595
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008374 | Grad Max: 0.252368
  -> Layer: shared_layers.0.bias | Grad Mean: 0.467190 | Grad Max: 2.051966
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.006408
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006166 | Grad Max: 0.006166
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003165 | Grad Max: 0.400174
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057902 | Grad Max: 2.235495
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.013809
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028711 | Grad Max: 0.143694
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000603
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006235 | Grad Max: 0.012460
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000296
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001657 | Grad Max: 0.003744
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001456 | Grad Max: 0.003073
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031024 | Grad Max: 0.031024
[GRADIENT NORM TOTAL] 9.7211

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.833
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008782  0.49912176] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.064
[MASKS] A(Pass/Fail): 702/1346 | B: 608/1440 | C: 304/1072
[LOSS Ex1] A: 0.65025 | B: 0.63790 | C: 0.63424
[LOGITS Ex2 A] Mean Abs: 1.990 | Max: 5.953
[LOSS Ex2] A: 0.12873 | B: 0.36407 | C: 0.26680
** [JOINT LOSS] ** : 0.893999
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002768 | Grad Max: 0.067927
  -> Layer: shared_layers.0.bias | Grad Mean: 0.103829 | Grad Max: 0.664709
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.005329
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001092 | Grad Max: 0.001092
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000781 | Grad Max: 0.185141
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013160 | Grad Max: 1.033513
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.002929
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002610 | Grad Max: 0.020000
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000386 | Grad Max: 0.002829
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000085 | Grad Max: 0.000562
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000389 | Grad Max: 0.001091
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000617 | Grad Max: 0.000617
[GRADIENT NORM TOTAL] 2.6177

[EPOCH SUMMARY] Train Loss: 0.8971

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8756 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 114/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.573
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6825978  0.31740215] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.065
[MASKS] A(Pass/Fail): 691/1357 | B: 611/1437 | C: 457/1591
[LOSS Ex1] A: 0.64642 | B: 0.63367 | C: 0.63519
[LOGITS Ex2 A] Mean Abs: 1.940 | Max: 5.853
[LOSS Ex2] A: 0.14793 | B: 0.35119 | C: 0.27447
** [JOINT LOSS] ** : 0.896294
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008655 | Grad Max: 0.251226
  -> Layer: shared_layers.0.bias | Grad Mean: 0.379058 | Grad Max: 1.588648
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005948
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001507 | Grad Max: 0.001507
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002513 | Grad Max: 0.320286
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045768 | Grad Max: 1.779865
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.010454
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022539 | Grad Max: 0.110005
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000502
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005011 | Grad Max: 0.010297
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001370 | Grad Max: 0.003287
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001285 | Grad Max: 0.002612
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026945 | Grad Max: 0.026945
[GRADIENT NORM TOTAL] 7.3848

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.670
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6126545  0.38734546] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.065
[MASKS] A(Pass/Fail): 582/1034 | B: 571/1285 | C: 413/1635
[LOSS Ex1] A: 0.64476 | B: 0.63789 | C: 0.63649
[LOGITS Ex2 A] Mean Abs: 2.002 | Max: 7.265
[LOSS Ex2] A: 0.14059 | B: 0.33931 | C: 0.26484
** [JOINT LOSS] ** : 0.887961
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006038 | Grad Max: 0.172470
  -> Layer: shared_layers.0.bias | Grad Mean: 0.284050 | Grad Max: 1.180224
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.005739
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006720 | Grad Max: 0.006720
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001964 | Grad Max: 0.259527
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035655 | Grad Max: 1.451970
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.010425
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017957 | Grad Max: 0.102355
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000508
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003918 | Grad Max: 0.008982
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000192
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001056 | Grad Max: 0.002392
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001031 | Grad Max: 0.002272
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021315 | Grad Max: 0.021315
[GRADIENT NORM TOTAL] 5.7547

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.835
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50726825 0.49273178] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 696/1352 | B: 615/1433 | C: 477/1571
[LOSS Ex1] A: 0.64531 | B: 0.63699 | C: 0.63047
[LOGITS Ex2 A] Mean Abs: 2.023 | Max: 7.386
[LOSS Ex2] A: 0.12978 | B: 0.36120 | C: 0.24533
** [JOINT LOSS] ** : 0.883025
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003307 | Grad Max: 0.161938
  -> Layer: shared_layers.0.bias | Grad Mean: 0.247138 | Grad Max: 1.166225
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.006060
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001360 | Grad Max: 0.001360
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001573 | Grad Max: 0.223916
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028143 | Grad Max: 1.238459
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.008642
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013714 | Grad Max: 0.085137
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002789 | Grad Max: 0.006193
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000141
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000754 | Grad Max: 0.001807
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000581 | Grad Max: 0.001899
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014070 | Grad Max: 0.014070
[GRADIENT NORM TOTAL] 5.0449

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.787
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107768 0.4892232] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 692/1356 | B: 608/1440 | C: 489/1559
[LOSS Ex1] A: 0.64245 | B: 0.63777 | C: 0.62623
[LOGITS Ex2 A] Mean Abs: 2.024 | Max: 6.085
[LOSS Ex2] A: 0.13435 | B: 0.35782 | C: 0.28383
** [JOINT LOSS] ** : 0.894147
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004781 | Grad Max: 0.133480
  -> Layer: shared_layers.0.bias | Grad Mean: 0.378912 | Grad Max: 1.497638
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002329 | Grad Max: 0.006681
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003655 | Grad Max: 0.003655
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.418860
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043881 | Grad Max: 2.335446
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000345 | Grad Max: 0.011071
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022407 | Grad Max: 0.111350
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000521
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004792 | Grad Max: 0.010877
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000230
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001295 | Grad Max: 0.003107
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001122 | Grad Max: 0.002440
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024895 | Grad Max: 0.024895
[GRADIENT NORM TOTAL] 7.6516

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.811
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50367737 0.49632263] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 691/1357 | B: 611/1437 | C: 463/1585
[LOSS Ex1] A: 0.64147 | B: 0.63354 | C: 0.63022
[LOGITS Ex2 A] Mean Abs: 1.985 | Max: 7.819
[LOSS Ex2] A: 0.14757 | B: 0.33440 | C: 0.26849
** [JOINT LOSS] ** : 0.885229
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002501 | Grad Max: 0.084452
  -> Layer: shared_layers.0.bias | Grad Mean: 0.055461 | Grad Max: 0.244953
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002326 | Grad Max: 0.006418
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000127 | Grad Max: 0.000127
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000599 | Grad Max: 0.124550
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009602 | Grad Max: 0.704680
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.003098
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001918 | Grad Max: 0.016809
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000161
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000372 | Grad Max: 0.002084
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.000604
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000358 | Grad Max: 0.000976
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002036 | Grad Max: 0.002036
[GRADIENT NORM TOTAL] 1.9013

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.672
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026987 0.4973013] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 667/1381 | B: 572/1284 | C: 473/1575
[LOSS Ex1] A: 0.64939 | B: 0.63775 | C: 0.63069
[LOGITS Ex2 A] Mean Abs: 1.956 | Max: 6.823
[LOSS Ex2] A: 0.13092 | B: 0.34824 | C: 0.24815
** [JOINT LOSS] ** : 0.881712
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005183 | Grad Max: 0.148027
  -> Layer: shared_layers.0.bias | Grad Mean: 0.467768 | Grad Max: 2.076950
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005816
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006372 | Grad Max: 0.006372
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003043 | Grad Max: 0.397593
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056806 | Grad Max: 2.237488
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000433 | Grad Max: 0.015979
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028327 | Grad Max: 0.171141
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000616
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006005 | Grad Max: 0.012262
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000315
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001579 | Grad Max: 0.003928
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001362 | Grad Max: 0.002883
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029185 | Grad Max: 0.029185
[GRADIENT NORM TOTAL] 9.8323

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.598
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54243743 0.4575626 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.062
[MASKS] A(Pass/Fail): 666/1382 | B: 615/1433 | C: 456/1592
[LOSS Ex1] A: 0.64936 | B: 0.63684 | C: 0.63258
[LOGITS Ex2 A] Mean Abs: 1.936 | Max: 6.819
[LOSS Ex2] A: 0.14241 | B: 0.37504 | C: 0.27552
** [JOINT LOSS] ** : 0.903919
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004304 | Grad Max: 0.133842
  -> Layer: shared_layers.0.bias | Grad Mean: 0.386256 | Grad Max: 1.882718
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.005509
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004587 | Grad Max: 0.004587
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002404 | Grad Max: 0.366781
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044259 | Grad Max: 2.057684
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000313 | Grad Max: 0.011190
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020356 | Grad Max: 0.111854
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000416
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004356 | Grad Max: 0.009058
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001172 | Grad Max: 0.002795
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001059 | Grad Max: 0.002035
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023147 | Grad Max: 0.023147
[GRADIENT NORM TOTAL] 8.2988

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.754
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7304071  0.26959288] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 719/1329 | B: 608/1440 | C: 491/1557
[LOSS Ex1] A: 0.64379 | B: 0.63761 | C: 0.63059
[LOGITS Ex2 A] Mean Abs: 2.006 | Max: 5.763
[LOSS Ex2] A: 0.12949 | B: 0.35836 | C: 0.27006
** [JOINT LOSS] ** : 0.889966
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003635 | Grad Max: 0.102892
  -> Layer: shared_layers.0.bias | Grad Mean: 0.171771 | Grad Max: 0.667385
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006328
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002225 | Grad Max: 0.002225
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001259 | Grad Max: 0.182739
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022574 | Grad Max: 1.023749
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.007757
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011087 | Grad Max: 0.073465
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000294
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002419 | Grad Max: 0.006118
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000124
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000654 | Grad Max: 0.001517
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000519 | Grad Max: 0.001666
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011816 | Grad Max: 0.011816
[GRADIENT NORM TOTAL] 3.6973

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.838
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008451  0.49915493] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.065
[MASKS] A(Pass/Fail): 703/1345 | B: 611/1437 | C: 461/1587
[LOSS Ex1] A: 0.65006 | B: 0.63336 | C: 0.63172
[LOGITS Ex2 A] Mean Abs: 2.030 | Max: 6.010
[LOSS Ex2] A: 0.13002 | B: 0.33192 | C: 0.26190
** [JOINT LOSS] ** : 0.879665
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002935 | Grad Max: 0.074390
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149697 | Grad Max: 0.736040
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.005560
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003566 | Grad Max: 0.003566
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001053 | Grad Max: 0.158942
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018699 | Grad Max: 0.883926
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.006179
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008715 | Grad Max: 0.059781
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000254
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001639 | Grad Max: 0.004773
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000104
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000443 | Grad Max: 0.001160
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000468 | Grad Max: 0.001659
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008496 | Grad Max: 0.008496
[GRADIENT NORM TOTAL] 3.1653

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.576
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6833938 0.3166062] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.065
[MASKS] A(Pass/Fail): 691/1357 | B: 572/1284 | C: 455/1593
[LOSS Ex1] A: 0.64622 | B: 0.63757 | C: 0.63371
[LOGITS Ex2 A] Mean Abs: 1.983 | Max: 5.801
[LOSS Ex2] A: 0.15207 | B: 0.34640 | C: 0.23757
** [JOINT LOSS] ** : 0.884513
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004642 | Grad Max: 0.150627
  -> Layer: shared_layers.0.bias | Grad Mean: 0.301995 | Grad Max: 1.116721
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002142 | Grad Max: 0.005898
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002200 | Grad Max: 0.002200
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.290062
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035239 | Grad Max: 1.625833
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.009889
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018197 | Grad Max: 0.095567
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000413
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003977 | Grad Max: 0.009211
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001055 | Grad Max: 0.002540
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000879 | Grad Max: 0.002051
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019273 | Grad Max: 0.019273
[GRADIENT NORM TOTAL] 5.9601

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.674
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6130964 0.3869036] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 582/1034 | B: 615/1433 | C: 470/1578
[LOSS Ex1] A: 0.64455 | B: 0.63667 | C: 0.63090
[LOGITS Ex2 A] Mean Abs: 2.014 | Max: 5.684
[LOSS Ex2] A: 0.13471 | B: 0.37210 | C: 0.24716
** [JOINT LOSS] ** : 0.888698
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002297 | Grad Max: 0.070394
  -> Layer: shared_layers.0.bias | Grad Mean: 0.238231 | Grad Max: 1.012541
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.006579
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003793 | Grad Max: 0.003793
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001513 | Grad Max: 0.350475
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027763 | Grad Max: 1.975450
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.010046
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013548 | Grad Max: 0.084816
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000295
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002845 | Grad Max: 0.006547
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000163
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000765 | Grad Max: 0.002062
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000658 | Grad Max: 0.001666
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014685 | Grad Max: 0.014685
[GRADIENT NORM TOTAL] 5.2917

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.840
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50728023 0.4927198 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 696/1352 | B: 608/1440 | C: 461/1587
[LOSS Ex1] A: 0.64510 | B: 0.63743 | C: 0.63274
[LOGITS Ex2 A] Mean Abs: 2.015 | Max: 7.032
[LOSS Ex2] A: 0.12976 | B: 0.35909 | C: 0.27371
** [JOINT LOSS] ** : 0.892608
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003106 | Grad Max: 0.114926
  -> Layer: shared_layers.0.bias | Grad Mean: 0.263704 | Grad Max: 1.504686
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005520
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000385 | Grad Max: 0.000385
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001732 | Grad Max: 0.306102
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031979 | Grad Max: 1.728538
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.008031
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014628 | Grad Max: 0.077573
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000333
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003087 | Grad Max: 0.007631
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000151
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000786 | Grad Max: 0.001952
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000615 | Grad Max: 0.001640
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013534 | Grad Max: 0.013534
[GRADIENT NORM TOTAL] 6.0667

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.791
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108119  0.48918808] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 692/1356 | B: 611/1437 | C: 497/1551
[LOSS Ex1] A: 0.64222 | B: 0.63319 | C: 0.62824
[LOGITS Ex2 A] Mean Abs: 2.016 | Max: 6.486
[LOSS Ex2] A: 0.14039 | B: 0.33665 | C: 0.23899
** [JOINT LOSS] ** : 0.873228
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004152 | Grad Max: 0.122086
  -> Layer: shared_layers.0.bias | Grad Mean: 0.285836 | Grad Max: 1.574270
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002375 | Grad Max: 0.006127
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001407 | Grad Max: 0.001407
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.310324
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035286 | Grad Max: 1.727903
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000262 | Grad Max: 0.010727
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017228 | Grad Max: 0.107848
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000373
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003682 | Grad Max: 0.007959
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000182
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000978 | Grad Max: 0.002484
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000827 | Grad Max: 0.002242
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018566 | Grad Max: 0.018566
[GRADIENT NORM TOTAL] 6.4528

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.816
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5036692 0.4963308] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 692/1356 | B: 572/1284 | C: 331/1045
[LOSS Ex1] A: 0.64123 | B: 0.63740 | C: 0.63029
[LOGITS Ex2 A] Mean Abs: 1.979 | Max: 6.902
[LOSS Ex2] A: 0.14902 | B: 0.34024 | C: 0.28270
** [JOINT LOSS] ** : 0.893626
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003417 | Grad Max: 0.107495
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126889 | Grad Max: 0.712899
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.006376
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004461 | Grad Max: 0.004461
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000928 | Grad Max: 0.147504
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015242 | Grad Max: 0.778373
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.003567
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003399 | Grad Max: 0.033007
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000159
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000521 | Grad Max: 0.002464
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000136 | Grad Max: 0.000616
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000181 | Grad Max: 0.000762
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002574 | Grad Max: 0.002574
[GRADIENT NORM TOTAL] 2.8785

[EPOCH SUMMARY] Train Loss: 0.8882

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8707 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8710 -> New: 0.8707)

############################## EPOCH 115/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.676
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50261533 0.4973846 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 667/1381 | B: 615/1433 | C: 467/1581
[LOSS Ex1] A: 0.64919 | B: 0.63650 | C: 0.63048
[LOGITS Ex2 A] Mean Abs: 1.934 | Max: 5.933
[LOSS Ex2] A: 0.13478 | B: 0.37024 | C: 0.25907
** [JOINT LOSS] ** : 0.893422
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004475 | Grad Max: 0.103009
  -> Layer: shared_layers.0.bias | Grad Mean: 0.255799 | Grad Max: 1.205365
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005868
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000941 | Grad Max: 0.000941
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001815 | Grad Max: 0.309473
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033791 | Grad Max: 1.749633
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000268 | Grad Max: 0.008641
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017225 | Grad Max: 0.092026
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000362
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003676 | Grad Max: 0.008145
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000187
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000991 | Grad Max: 0.002320
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000934 | Grad Max: 0.002075
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019351 | Grad Max: 0.019351
[GRADIENT NORM TOTAL] 5.5997

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.602
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54257524 0.45742473] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.063
[MASKS] A(Pass/Fail): 666/1382 | B: 609/1439 | C: 451/1597
[LOSS Ex1] A: 0.64915 | B: 0.63726 | C: 0.63368
[LOGITS Ex2 A] Mean Abs: 1.949 | Max: 5.951
[LOSS Ex2] A: 0.14232 | B: 0.36852 | C: 0.26778
** [JOINT LOSS] ** : 0.899576
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.063664
  -> Layer: shared_layers.0.bias | Grad Mean: 0.142491 | Grad Max: 0.793353
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.006331
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010423 | Grad Max: 0.010423
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000946 | Grad Max: 0.218812
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016986 | Grad Max: 1.212684
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000098 | Grad Max: 0.006741
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006206 | Grad Max: 0.059539
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000195
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001214 | Grad Max: 0.003795
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000292 | Grad Max: 0.001013
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000237 | Grad Max: 0.000869
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003585 | Grad Max: 0.003585
[GRADIENT NORM TOTAL] 3.4889

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.759
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7316437  0.26835635] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.066
[MASKS] A(Pass/Fail): 720/1328 | B: 611/1437 | C: 500/1548
[LOSS Ex1] A: 0.64356 | B: 0.63301 | C: 0.62741
[LOGITS Ex2 A] Mean Abs: 1.990 | Max: 5.949
[LOSS Ex2] A: 0.12478 | B: 0.34607 | C: 0.25921
** [JOINT LOSS] ** : 0.878014
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002464 | Grad Max: 0.077457
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140781 | Grad Max: 0.630270
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002364 | Grad Max: 0.006406
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006408 | Grad Max: 0.006408
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000941 | Grad Max: 0.183176
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016661 | Grad Max: 1.022259
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.005224
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005511 | Grad Max: 0.036755
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001228 | Grad Max: 0.003347
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000335 | Grad Max: 0.000917
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000415 | Grad Max: 0.001421
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006771 | Grad Max: 0.006771
[GRADIENT NORM TOTAL] 3.2763

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.843
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008713  0.49912864] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.065
[MASKS] A(Pass/Fail): 703/1345 | B: 573/1283 | C: 450/1598
[LOSS Ex1] A: 0.64985 | B: 0.63724 | C: 0.63160
[LOGITS Ex2 A] Mean Abs: 1.985 | Max: 5.588
[LOSS Ex2] A: 0.12021 | B: 0.33936 | C: 0.28324
** [JOINT LOSS] ** : 0.887167
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003879 | Grad Max: 0.107966
  -> Layer: shared_layers.0.bias | Grad Mean: 0.159343 | Grad Max: 0.787123
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005773
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002359 | Grad Max: 0.002359
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001210 | Grad Max: 0.219242
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021908 | Grad Max: 1.223605
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000157 | Grad Max: 0.006364
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009999 | Grad Max: 0.050697
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000265
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002228 | Grad Max: 0.005546
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000604 | Grad Max: 0.001800
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000565 | Grad Max: 0.001609
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011735 | Grad Max: 0.011735
[GRADIENT NORM TOTAL] 3.5921

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.580
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6842632  0.31573677] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.066
[MASKS] A(Pass/Fail): 690/1358 | B: 616/1432 | C: 475/1573
[LOSS Ex1] A: 0.64598 | B: 0.63634 | C: 0.62811
[LOGITS Ex2 A] Mean Abs: 1.991 | Max: 5.813
[LOSS Ex2] A: 0.14317 | B: 0.36348 | C: 0.23737
** [JOINT LOSS] ** : 0.884815
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002515 | Grad Max: 0.043374
  -> Layer: shared_layers.0.bias | Grad Mean: 0.106495 | Grad Max: 0.529414
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.006004
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001035 | Grad Max: 0.001035
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000678 | Grad Max: 0.264565
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011676 | Grad Max: 1.496614
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.002851
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002818 | Grad Max: 0.024958
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000148
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000562 | Grad Max: 0.002527
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000142 | Grad Max: 0.000761
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000409 | Grad Max: 0.001052
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001728 | Grad Max: 0.001728
[GRADIENT NORM TOTAL] 3.0609

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.678
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6135699  0.38643003] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 581/1035 | B: 610/1438 | C: 439/1609
[LOSS Ex1] A: 0.64430 | B: 0.63709 | C: 0.63439
[LOGITS Ex2 A] Mean Abs: 2.043 | Max: 6.362
[LOSS Ex2] A: 0.13377 | B: 0.36514 | C: 0.28525
** [JOINT LOSS] ** : 0.899981
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005249 | Grad Max: 0.116040
  -> Layer: shared_layers.0.bias | Grad Mean: 0.330195 | Grad Max: 1.472547
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.006170
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000818 | Grad Max: 0.000818
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.285612
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040109 | Grad Max: 1.603897
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.011988
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019549 | Grad Max: 0.101784
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000412
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004136 | Grad Max: 0.008770
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000197
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001086 | Grad Max: 0.002662
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000919 | Grad Max: 0.002114
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019603 | Grad Max: 0.019603
[GRADIENT NORM TOTAL] 6.7389

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.845
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50727564 0.4927244 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 698/1350 | B: 612/1436 | C: 455/1593
[LOSS Ex1] A: 0.64486 | B: 0.63284 | C: 0.63100
[LOGITS Ex2 A] Mean Abs: 2.017 | Max: 6.661
[LOSS Ex2] A: 0.13678 | B: 0.34742 | C: 0.26300
** [JOINT LOSS] ** : 0.885298
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002486 | Grad Max: 0.052567
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132829 | Grad Max: 0.657448
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002282 | Grad Max: 0.005642
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001575 | Grad Max: 0.001575
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000984 | Grad Max: 0.202102
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017770 | Grad Max: 1.134006
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.004590
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007762 | Grad Max: 0.040765
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000228
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001631 | Grad Max: 0.004946
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000430 | Grad Max: 0.001075
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000390 | Grad Max: 0.001293
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007625 | Grad Max: 0.007625
[GRADIENT NORM TOTAL] 3.1927

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.797
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107413  0.48925874] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 692/1356 | B: 573/1283 | C: 460/1588
[LOSS Ex1] A: 0.64197 | B: 0.63706 | C: 0.63263
[LOGITS Ex2 A] Mean Abs: 2.007 | Max: 6.503
[LOSS Ex2] A: 0.13152 | B: 0.35843 | C: 0.26367
** [JOINT LOSS] ** : 0.888429
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002977 | Grad Max: 0.155390
  -> Layer: shared_layers.0.bias | Grad Mean: 0.438176 | Grad Max: 2.079232
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006721
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004190 | Grad Max: 0.004190
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002667 | Grad Max: 0.287820
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049626 | Grad Max: 1.617141
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.016836
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025879 | Grad Max: 0.151395
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000458
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005340 | Grad Max: 0.010700
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001414 | Grad Max: 0.003154
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001109 | Grad Max: 0.002434
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025984 | Grad Max: 0.025984
[GRADIENT NORM TOTAL] 8.9674

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.822
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.503761   0.49623898] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 692/1356 | B: 616/1432 | C: 492/1556
[LOSS Ex1] A: 0.64097 | B: 0.63616 | C: 0.62887
[LOGITS Ex2 A] Mean Abs: 1.971 | Max: 6.370
[LOSS Ex2] A: 0.15419 | B: 0.37670 | C: 0.27775
** [JOINT LOSS] ** : 0.904882
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004731 | Grad Max: 0.173977
  -> Layer: shared_layers.0.bias | Grad Mean: 0.513686 | Grad Max: 2.363801
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.006473
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005538 | Grad Max: 0.005538
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003223 | Grad Max: 0.387827
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060068 | Grad Max: 2.162498
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000464 | Grad Max: 0.015960
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030785 | Grad Max: 0.169271
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000577
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006338 | Grad Max: 0.012421
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000289
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001687 | Grad Max: 0.003936
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001421 | Grad Max: 0.002646
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032210 | Grad Max: 0.032210
[GRADIENT NORM TOTAL] 10.5389

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.681
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026096 0.4973904] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.065
[MASKS] A(Pass/Fail): 667/1381 | B: 610/1438 | C: 465/1583
[LOSS Ex1] A: 0.64895 | B: 0.63691 | C: 0.63146
[LOGITS Ex2 A] Mean Abs: 1.989 | Max: 6.494
[LOSS Ex2] A: 0.13473 | B: 0.35766 | C: 0.23608
** [JOINT LOSS] ** : 0.881932
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.052019
  -> Layer: shared_layers.0.bias | Grad Mean: 0.085002 | Grad Max: 0.423141
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005569
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005104 | Grad Max: 0.005104
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000668 | Grad Max: 0.152737
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011485 | Grad Max: 0.859001
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.003024
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002015 | Grad Max: 0.020445
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000140
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000374 | Grad Max: 0.002234
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000094 | Grad Max: 0.000538
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000449 | Grad Max: 0.001196
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001049 | Grad Max: 0.001049
[GRADIENT NORM TOTAL] 2.4969

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.606
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5425833 0.4574167] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.063
[MASKS] A(Pass/Fail): 666/1382 | B: 612/1436 | C: 472/1576
[LOSS Ex1] A: 0.64892 | B: 0.63265 | C: 0.63235
[LOGITS Ex2 A] Mean Abs: 2.009 | Max: 6.570
[LOSS Ex2] A: 0.14367 | B: 0.34499 | C: 0.25635
** [JOINT LOSS] ** : 0.886313
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005767 | Grad Max: 0.154884
  -> Layer: shared_layers.0.bias | Grad Mean: 0.413429 | Grad Max: 1.972119
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.006595
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012174 | Grad Max: 0.012174
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002676 | Grad Max: 0.442663
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050079 | Grad Max: 2.480869
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000382 | Grad Max: 0.013943
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025047 | Grad Max: 0.145791
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000511
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005256 | Grad Max: 0.011051
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000255
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001388 | Grad Max: 0.003275
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001194 | Grad Max: 0.002699
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026494 | Grad Max: 0.026494
[GRADIENT NORM TOTAL] 8.8657

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.764
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73289424 0.26710573] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.066
[MASKS] A(Pass/Fail): 721/1327 | B: 573/1283 | C: 456/1592
[LOSS Ex1] A: 0.64331 | B: 0.63688 | C: 0.63417
[LOGITS Ex2 A] Mean Abs: 2.036 | Max: 6.124
[LOSS Ex2] A: 0.14465 | B: 0.33597 | C: 0.26662
** [JOINT LOSS] ** : 0.887198
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007653 | Grad Max: 0.212161
  -> Layer: shared_layers.0.bias | Grad Mean: 0.410863 | Grad Max: 1.542050
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.006063
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005065 | Grad Max: 0.005065
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002699 | Grad Max: 0.382969
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049674 | Grad Max: 2.132585
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000385 | Grad Max: 0.012168
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024802 | Grad Max: 0.132445
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000495
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005315 | Grad Max: 0.010985
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000260
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001407 | Grad Max: 0.003356
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001254 | Grad Max: 0.002632
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026988 | Grad Max: 0.026988
[GRADIENT NORM TOTAL] 8.2325

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.849
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008316 0.4991684] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.065
[MASKS] A(Pass/Fail): 704/1344 | B: 615/1433 | C: 512/1536
[LOSS Ex1] A: 0.64962 | B: 0.63598 | C: 0.62640
[LOGITS Ex2 A] Mean Abs: 2.028 | Max: 5.770
[LOSS Ex2] A: 0.12182 | B: 0.36233 | C: 0.25004
** [JOINT LOSS] ** : 0.882062
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002433 | Grad Max: 0.074010
  -> Layer: shared_layers.0.bias | Grad Mean: 0.135247 | Grad Max: 0.595598
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006088
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002839 | Grad Max: 0.002839
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000837 | Grad Max: 0.446815
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014504 | Grad Max: 2.480123
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.004337
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004479 | Grad Max: 0.036868
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000212
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001027 | Grad Max: 0.003508
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000281 | Grad Max: 0.000982
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000373 | Grad Max: 0.001182
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006165 | Grad Max: 0.006165
[GRADIENT NORM TOTAL] 4.0189

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.585
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.68518806 0.31481192] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 690/1358 | B: 610/1438 | C: 309/1067
[LOSS Ex1] A: 0.64574 | B: 0.63673 | C: 0.62918
[LOGITS Ex2 A] Mean Abs: 1.995 | Max: 6.400
[LOSS Ex2] A: 0.14671 | B: 0.36555 | C: 0.26040
** [JOINT LOSS] ** : 0.894771
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007010 | Grad Max: 0.205263
  -> Layer: shared_layers.0.bias | Grad Mean: 0.302700 | Grad Max: 1.328791
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.005736
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006249 | Grad Max: 0.006249
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.346935
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037540 | Grad Max: 1.962603
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000299 | Grad Max: 0.010264
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019079 | Grad Max: 0.105194
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000442
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004112 | Grad Max: 0.008395
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001086 | Grad Max: 0.002585
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001024 | Grad Max: 0.002113
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020763 | Grad Max: 0.020763
[GRADIENT NORM TOTAL] 6.2918

[EPOCH SUMMARY] Train Loss: 0.8896

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8685 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8707 -> New: 0.8685)

############################## EPOCH 116/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.683
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6141272  0.38587278] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 582/1034 | B: 612/1436 | C: 469/1579
[LOSS Ex1] A: 0.64406 | B: 0.63247 | C: 0.63019
[LOGITS Ex2 A] Mean Abs: 2.039 | Max: 6.828
[LOSS Ex2] A: 0.13485 | B: 0.34244 | C: 0.26914
** [JOINT LOSS] ** : 0.884382
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004007 | Grad Max: 0.133813
  -> Layer: shared_layers.0.bias | Grad Mean: 0.102221 | Grad Max: 0.472252
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.006191
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000854 | Grad Max: 0.000854
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000788 | Grad Max: 0.241629
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013871 | Grad Max: 1.341937
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004018
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005068 | Grad Max: 0.030859
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000211
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001109 | Grad Max: 0.003285
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000307 | Grad Max: 0.000979
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000352 | Grad Max: 0.001224
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006850 | Grad Max: 0.006850
[GRADIENT NORM TOTAL] 2.7974

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.851
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072864  0.49271357] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 700/1348 | B: 573/1283 | C: 463/1585
[LOSS Ex1] A: 0.64462 | B: 0.63670 | C: 0.63211
[LOGITS Ex2 A] Mean Abs: 2.046 | Max: 7.474
[LOSS Ex2] A: 0.12980 | B: 0.34813 | C: 0.26650
** [JOINT LOSS] ** : 0.885956
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004396 | Grad Max: 0.157146
  -> Layer: shared_layers.0.bias | Grad Mean: 0.380320 | Grad Max: 2.015921
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005996
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001186 | Grad Max: 0.001186
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002494 | Grad Max: 0.417714
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045625 | Grad Max: 2.325788
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.012854
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022416 | Grad Max: 0.131672
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000474
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004773 | Grad Max: 0.010799
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000222
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001256 | Grad Max: 0.002958
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001015 | Grad Max: 0.002400
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022913 | Grad Max: 0.022913
[GRADIENT NORM TOTAL] 8.3392

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.802
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51072824 0.48927176] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 692/1356 | B: 615/1433 | C: 443/1605
[LOSS Ex1] A: 0.64172 | B: 0.63581 | C: 0.63101
[LOGITS Ex2 A] Mean Abs: 2.055 | Max: 6.146
[LOSS Ex2] A: 0.13592 | B: 0.35747 | C: 0.26489
** [JOINT LOSS] ** : 0.888942
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005166 | Grad Max: 0.150665
  -> Layer: shared_layers.0.bias | Grad Mean: 0.339502 | Grad Max: 1.674165
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006531
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007798 | Grad Max: 0.007798
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.315985
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041994 | Grad Max: 1.763909
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000323 | Grad Max: 0.011845
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021050 | Grad Max: 0.109777
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000505
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004479 | Grad Max: 0.010217
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000217
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001160 | Grad Max: 0.002821
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000968 | Grad Max: 0.002253
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020991 | Grad Max: 0.020991
[GRADIENT NORM TOTAL] 7.2109

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.827
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50382143 0.49617857] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 692/1356 | B: 610/1438 | C: 482/1566
[LOSS Ex1] A: 0.64072 | B: 0.63656 | C: 0.62889
[LOGITS Ex2 A] Mean Abs: 1.996 | Max: 7.559
[LOSS Ex2] A: 0.15138 | B: 0.35887 | C: 0.25406
** [JOINT LOSS] ** : 0.890160
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002928 | Grad Max: 0.119263
  -> Layer: shared_layers.0.bias | Grad Mean: 0.061432 | Grad Max: 0.239146
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.006712
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000249 | Grad Max: 0.000249
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000732 | Grad Max: 0.153026
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011756 | Grad Max: 0.843758
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.003152
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002180 | Grad Max: 0.016998
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000123
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000289 | Grad Max: 0.001920
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000049
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000075 | Grad Max: 0.000431
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000292 | Grad Max: 0.000703
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000232 | Grad Max: 0.000232
[GRADIENT NORM TOTAL] 2.2881

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.685
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025766 0.4974234] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.065
[MASKS] A(Pass/Fail): 667/1381 | B: 614/1434 | C: 507/1541
[LOSS Ex1] A: 0.64872 | B: 0.63229 | C: 0.62529
[LOGITS Ex2 A] Mean Abs: 1.962 | Max: 5.880
[LOSS Ex2] A: 0.13275 | B: 0.34854 | C: 0.25407
** [JOINT LOSS] ** : 0.880557
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005037 | Grad Max: 0.122446
  -> Layer: shared_layers.0.bias | Grad Mean: 0.377017 | Grad Max: 1.465725
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.005875
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003805 | Grad Max: 0.003805
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002440 | Grad Max: 0.275116
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045448 | Grad Max: 1.553258
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000374 | Grad Max: 0.013870
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024626 | Grad Max: 0.145578
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000521
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005166 | Grad Max: 0.011199
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000273
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001326 | Grad Max: 0.003239
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001112 | Grad Max: 0.002562
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023929 | Grad Max: 0.023929
[GRADIENT NORM TOTAL] 7.4989

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.610
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54272366 0.45727637] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.550 | Std: 0.063
[MASKS] A(Pass/Fail): 666/1382 | B: 574/1282 | C: 495/1553
[LOSS Ex1] A: 0.64870 | B: 0.63653 | C: 0.63062
[LOGITS Ex2 A] Mean Abs: 1.964 | Max: 5.879
[LOSS Ex2] A: 0.13776 | B: 0.34298 | C: 0.28201
** [JOINT LOSS] ** : 0.892868
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002984 | Grad Max: 0.096877
  -> Layer: shared_layers.0.bias | Grad Mean: 0.198097 | Grad Max: 0.638449
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005928
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007559 | Grad Max: 0.007559
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001289 | Grad Max: 0.133694
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023195 | Grad Max: 0.748247
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.006317
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011480 | Grad Max: 0.053745
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000264
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002502 | Grad Max: 0.005828
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000163
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000670 | Grad Max: 0.001893
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000623 | Grad Max: 0.001520
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013405 | Grad Max: 0.013405
[GRADIENT NORM TOTAL] 3.9077

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.770
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73423314 0.2657668 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.067
[MASKS] A(Pass/Fail): 722/1326 | B: 615/1433 | C: 454/1594
[LOSS Ex1] A: 0.64306 | B: 0.63563 | C: 0.63142
[LOGITS Ex2 A] Mean Abs: 2.048 | Max: 6.431
[LOSS Ex2] A: 0.12726 | B: 0.35988 | C: 0.26893
** [JOINT LOSS] ** : 0.888725
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005916 | Grad Max: 0.147009
  -> Layer: shared_layers.0.bias | Grad Mean: 0.415086 | Grad Max: 1.792737
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006297
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007258 | Grad Max: 0.007258
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002714 | Grad Max: 0.290657
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050576 | Grad Max: 1.635965
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.013188
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026060 | Grad Max: 0.135441
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000536
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005536 | Grad Max: 0.011528
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000239
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001458 | Grad Max: 0.003312
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001239 | Grad Max: 0.002708
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027399 | Grad Max: 0.027399
[GRADIENT NORM TOTAL] 8.3678

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.855
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008498  0.49915025] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 704/1344 | B: 610/1438 | C: 461/1587
[LOSS Ex1] A: 0.64940 | B: 0.63638 | C: 0.62761
[LOGITS Ex2 A] Mean Abs: 2.071 | Max: 6.098
[LOSS Ex2] A: 0.13511 | B: 0.36397 | C: 0.25306
** [JOINT LOSS] ** : 0.888512
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005464 | Grad Max: 0.205028
  -> Layer: shared_layers.0.bias | Grad Mean: 0.567176 | Grad Max: 2.585366
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005530
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003574 | Grad Max: 0.003574
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003530 | Grad Max: 0.457706
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066178 | Grad Max: 2.561552
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.016744
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035250 | Grad Max: 0.183535
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000623
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007212 | Grad Max: 0.014050
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000317
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001909 | Grad Max: 0.004155
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001572 | Grad Max: 0.003360
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035870 | Grad Max: 0.035870
[GRADIENT NORM TOTAL] 11.6550

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.589
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.68620825 0.31379178] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 690/1358 | B: 614/1434 | C: 477/1571
[LOSS Ex1] A: 0.64550 | B: 0.63211 | C: 0.62830
[LOGITS Ex2 A] Mean Abs: 2.023 | Max: 6.050
[LOSS Ex2] A: 0.14664 | B: 0.34128 | C: 0.25888
** [JOINT LOSS] ** : 0.884236
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002573 | Grad Max: 0.066361
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201339 | Grad Max: 0.817155
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.005795
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001832 | Grad Max: 0.001832
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001553 | Grad Max: 0.280590
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028158 | Grad Max: 1.570699
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.008475
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014000 | Grad Max: 0.088571
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000322
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002760 | Grad Max: 0.006329
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000151
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000728 | Grad Max: 0.001704
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000599 | Grad Max: 0.001548
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013530 | Grad Max: 0.013530
[GRADIENT NORM TOTAL] 4.8240

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.687
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6146742  0.38532573] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.067
[MASKS] A(Pass/Fail): 582/1034 | B: 575/1281 | C: 466/1582
[LOSS Ex1] A: 0.64381 | B: 0.63635 | C: 0.63107
[LOGITS Ex2 A] Mean Abs: 2.016 | Max: 5.822
[LOSS Ex2] A: 0.13664 | B: 0.35931 | C: 0.26685
** [JOINT LOSS] ** : 0.891347
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007634 | Grad Max: 0.199615
  -> Layer: shared_layers.0.bias | Grad Mean: 0.588634 | Grad Max: 2.671726
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.006157
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003764 | Grad Max: 0.003764
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003843 | Grad Max: 0.605954
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.071244 | Grad Max: 3.378571
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000555 | Grad Max: 0.017299
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036874 | Grad Max: 0.183848
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000716
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007864 | Grad Max: 0.015887
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000372
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002091 | Grad Max: 0.005027
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001829 | Grad Max: 0.003236
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039769 | Grad Max: 0.039769
[GRADIENT NORM TOTAL] 12.3238

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.856
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073193 0.4926808] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 701/1347 | B: 615/1433 | C: 473/1575
[LOSS Ex1] A: 0.64438 | B: 0.63547 | C: 0.63321
[LOGITS Ex2 A] Mean Abs: 1.990 | Max: 6.689
[LOSS Ex2] A: 0.13850 | B: 0.40267 | C: 0.27725
** [JOINT LOSS] ** : 0.910496
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010120 | Grad Max: 0.257573
  -> Layer: shared_layers.0.bias | Grad Mean: 0.769439 | Grad Max: 3.486998
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.005918
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002973 | Grad Max: 0.002973
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005087 | Grad Max: 0.727261
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.094249 | Grad Max: 4.054877
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000722 | Grad Max: 0.024820
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047819 | Grad Max: 0.254872
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.000855
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010222 | Grad Max: 0.020083
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000466
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002731 | Grad Max: 0.006200
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002325 | Grad Max: 0.004519
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052133 | Grad Max: 0.052133
[GRADIENT NORM TOTAL] 16.1813

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.806
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51074547 0.48925453] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 692/1356 | B: 610/1438 | C: 444/1604
[LOSS Ex1] A: 0.64147 | B: 0.63622 | C: 0.63512
[LOGITS Ex2 A] Mean Abs: 1.989 | Max: 5.797
[LOSS Ex2] A: 0.13208 | B: 0.37505 | C: 0.25319
** [JOINT LOSS] ** : 0.891042
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004945 | Grad Max: 0.157434
  -> Layer: shared_layers.0.bias | Grad Mean: 0.473878 | Grad Max: 2.211975
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.006511
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009663 | Grad Max: 0.009663
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003040 | Grad Max: 0.387090
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056335 | Grad Max: 2.180807
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000436 | Grad Max: 0.016591
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029140 | Grad Max: 0.165657
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000545
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006173 | Grad Max: 0.012750
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000284
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001636 | Grad Max: 0.003873
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001358 | Grad Max: 0.002650
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030706 | Grad Max: 0.030706
[GRADIENT NORM TOTAL] 9.8081

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.831
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038373 0.4961627] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.066
[MASKS] A(Pass/Fail): 694/1354 | B: 615/1433 | C: 482/1566
[LOSS Ex1] A: 0.64048 | B: 0.63196 | C: 0.62631
[LOGITS Ex2 A] Mean Abs: 2.021 | Max: 6.375
[LOSS Ex2] A: 0.15163 | B: 0.33841 | C: 0.23733
** [JOINT LOSS] ** : 0.875368
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003710 | Grad Max: 0.115586
  -> Layer: shared_layers.0.bias | Grad Mean: 0.145722 | Grad Max: 0.542652
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002390 | Grad Max: 0.006779
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005016 | Grad Max: 0.005016
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001116 | Grad Max: 0.241607
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020066 | Grad Max: 1.353856
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.004117
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007195 | Grad Max: 0.034037
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000233
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001614 | Grad Max: 0.004495
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000106
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000436 | Grad Max: 0.001293
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000425 | Grad Max: 0.001623
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008475 | Grad Max: 0.008475
[GRADIENT NORM TOTAL] 3.7821

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.689
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50253856 0.49746147] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.551 | Std: 0.065
[MASKS] A(Pass/Fail): 668/1380 | B: 576/1280 | C: 306/1070
[LOSS Ex1] A: 0.64851 | B: 0.63621 | C: 0.63379
[LOGITS Ex2 A] Mean Abs: 2.001 | Max: 6.182
[LOSS Ex2] A: 0.13341 | B: 0.34026 | C: 0.23674
** [JOINT LOSS] ** : 0.876301
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005787 | Grad Max: 0.177016
  -> Layer: shared_layers.0.bias | Grad Mean: 0.406778 | Grad Max: 2.024339
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.005877
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001289 | Grad Max: 0.001289
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002616 | Grad Max: 0.427551
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048767 | Grad Max: 2.383898
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000355 | Grad Max: 0.011562
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023472 | Grad Max: 0.117552
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000498
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005018 | Grad Max: 0.010435
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001376 | Grad Max: 0.002896
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001137 | Grad Max: 0.002998
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026785 | Grad Max: 0.026785
[GRADIENT NORM TOTAL] 8.8666

[EPOCH SUMMARY] Train Loss: 0.8878

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8693 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 117/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.614
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54280216 0.45719787] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.063
[MASKS] A(Pass/Fail): 667/1381 | B: 615/1433 | C: 462/1586
[LOSS Ex1] A: 0.64850 | B: 0.63533 | C: 0.63064
[LOGITS Ex2 A] Mean Abs: 1.973 | Max: 5.747
[LOSS Ex2] A: 0.13569 | B: 0.35807 | C: 0.27798
** [JOINT LOSS] ** : 0.895401
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003310 | Grad Max: 0.108033
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282359 | Grad Max: 1.420349
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.006036
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008490 | Grad Max: 0.008490
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001822 | Grad Max: 0.312554
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033818 | Grad Max: 1.746763
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.009457
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015985 | Grad Max: 0.083717
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000344
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003330 | Grad Max: 0.007642
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000864 | Grad Max: 0.002054
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000667 | Grad Max: 0.001721
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015518 | Grad Max: 0.015518
[GRADIENT NORM TOTAL] 6.3170

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.774
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73535657 0.26464346] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.067
[MASKS] A(Pass/Fail): 722/1326 | B: 610/1438 | C: 462/1586
[LOSS Ex1] A: 0.64284 | B: 0.63608 | C: 0.63584
[LOGITS Ex2 A] Mean Abs: 1.981 | Max: 5.957
[LOSS Ex2] A: 0.12397 | B: 0.36228 | C: 0.27396
** [JOINT LOSS] ** : 0.891661
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002696 | Grad Max: 0.084339
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182847 | Grad Max: 0.794398
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.006110
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007461 | Grad Max: 0.007461
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001196 | Grad Max: 0.201089
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021676 | Grad Max: 1.110722
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000160 | Grad Max: 0.006746
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010575 | Grad Max: 0.064636
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000258
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002123 | Grad Max: 0.005180
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000119
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000577 | Grad Max: 0.001523
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000481 | Grad Max: 0.001477
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011240 | Grad Max: 0.011240
[GRADIENT NORM TOTAL] 3.8416

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.860
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008404  0.49915963] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 705/1343 | B: 615/1433 | C: 467/1581
[LOSS Ex1] A: 0.64920 | B: 0.63182 | C: 0.63024
[LOGITS Ex2 A] Mean Abs: 1.998 | Max: 5.824
[LOSS Ex2] A: 0.12094 | B: 0.34612 | C: 0.23995
** [JOINT LOSS] ** : 0.872755
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004689 | Grad Max: 0.144382
  -> Layer: shared_layers.0.bias | Grad Mean: 0.251673 | Grad Max: 0.886535
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.005803
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005695 | Grad Max: 0.005695
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001749 | Grad Max: 0.245642
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032250 | Grad Max: 1.346692
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.008237
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015523 | Grad Max: 0.091043
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000367
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003330 | Grad Max: 0.007113
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000181
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000876 | Grad Max: 0.002194
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000778 | Grad Max: 0.002073
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016820 | Grad Max: 0.016820
[GRADIENT NORM TOTAL] 5.3683

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.593
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6869606  0.31303945] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.067
[MASKS] A(Pass/Fail): 690/1358 | B: 577/1279 | C: 456/1592
[LOSS Ex1] A: 0.64529 | B: 0.63607 | C: 0.62849
[LOGITS Ex2 A] Mean Abs: 1.995 | Max: 5.849
[LOSS Ex2] A: 0.14597 | B: 0.33664 | C: 0.24967
** [JOINT LOSS] ** : 0.880711
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002596 | Grad Max: 0.060273
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111215 | Grad Max: 0.517684
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006069
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000991 | Grad Max: 0.000991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000742 | Grad Max: 0.141450
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012202 | Grad Max: 0.749668
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.004446
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003730 | Grad Max: 0.033703
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000180
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000742 | Grad Max: 0.003089
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000072
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000215 | Grad Max: 0.000847
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000371 | Grad Max: 0.001234
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005572 | Grad Max: 0.005572
[GRADIENT NORM TOTAL] 2.5386

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.691
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6150381  0.38496187] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.067
[MASKS] A(Pass/Fail): 582/1034 | B: 615/1433 | C: 453/1595
[LOSS Ex1] A: 0.64360 | B: 0.63519 | C: 0.63253
[LOGITS Ex2 A] Mean Abs: 2.063 | Max: 6.850
[LOSS Ex2] A: 0.12162 | B: 0.35808 | C: 0.25980
** [JOINT LOSS] ** : 0.883605
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002691 | Grad Max: 0.083468
  -> Layer: shared_layers.0.bias | Grad Mean: 0.238993 | Grad Max: 1.060120
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.006386
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003840 | Grad Max: 0.003840
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001528 | Grad Max: 0.197251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028159 | Grad Max: 1.091436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.009704
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014285 | Grad Max: 0.087187
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000305
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002985 | Grad Max: 0.006870
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000158
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000769 | Grad Max: 0.001986
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001848
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012782 | Grad Max: 0.012782
[GRADIENT NORM TOTAL] 4.9117

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.861
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073497 0.4926502] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.067
[MASKS] A(Pass/Fail): 701/1347 | B: 613/1435 | C: 488/1560
[LOSS Ex1] A: 0.64417 | B: 0.63594 | C: 0.62781
[LOGITS Ex2 A] Mean Abs: 2.034 | Max: 8.542
[LOSS Ex2] A: 0.12539 | B: 0.35052 | C: 0.26361
** [JOINT LOSS] ** : 0.882481
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003037 | Grad Max: 0.144780
  -> Layer: shared_layers.0.bias | Grad Mean: 0.167506 | Grad Max: 0.814997
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005727
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003009 | Grad Max: 0.003009
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000942 | Grad Max: 0.421650
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016940 | Grad Max: 2.354693
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.004760
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004847 | Grad Max: 0.043996
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000203
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001002 | Grad Max: 0.003193
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000254 | Grad Max: 0.000807
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000356 | Grad Max: 0.001236
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004505 | Grad Max: 0.004505
[GRADIENT NORM TOTAL] 4.5483

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.811
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51066923 0.4893308 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 692/1356 | B: 615/1433 | C: 510/1538
[LOSS Ex1] A: 0.64125 | B: 0.63167 | C: 0.62555
[LOGITS Ex2 A] Mean Abs: 2.017 | Max: 6.240
[LOSS Ex2] A: 0.12901 | B: 0.34481 | C: 0.26554
** [JOINT LOSS] ** : 0.879275
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.055756
  -> Layer: shared_layers.0.bias | Grad Mean: 0.133702 | Grad Max: 0.787369
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002467 | Grad Max: 0.006466
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008749 | Grad Max: 0.008749
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001036 | Grad Max: 0.251035
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018543 | Grad Max: 1.386706
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000096 | Grad Max: 0.004232
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006091 | Grad Max: 0.036515
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000182
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001238 | Grad Max: 0.003796
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000329 | Grad Max: 0.001058
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001167
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006092 | Grad Max: 0.006092
[GRADIENT NORM TOTAL] 3.9182

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.836
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50393134 0.49606866] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 694/1354 | B: 577/1279 | C: 469/1579
[LOSS Ex1] A: 0.64026 | B: 0.63591 | C: 0.62873
[LOGITS Ex2 A] Mean Abs: 1.995 | Max: 6.157
[LOSS Ex2] A: 0.15203 | B: 0.33951 | C: 0.23877
** [JOINT LOSS] ** : 0.878406
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003813 | Grad Max: 0.158306
  -> Layer: shared_layers.0.bias | Grad Mean: 0.081935 | Grad Max: 0.329071
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.006153
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002334 | Grad Max: 0.002334
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000770 | Grad Max: 0.182728
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011900 | Grad Max: 1.012790
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003810
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002323 | Grad Max: 0.026332
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000161
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000469 | Grad Max: 0.002983
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000129 | Grad Max: 0.000605
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000378 | Grad Max: 0.001093
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001723 | Grad Max: 0.001723
[GRADIENT NORM TOTAL] 2.2884

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.693
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025383  0.49746168] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.066
[MASKS] A(Pass/Fail): 668/1380 | B: 615/1433 | C: 481/1567
[LOSS Ex1] A: 0.64830 | B: 0.63502 | C: 0.62874
[LOGITS Ex2 A] Mean Abs: 1.984 | Max: 5.510
[LOSS Ex2] A: 0.12722 | B: 0.36023 | C: 0.24143
** [JOINT LOSS] ** : 0.880313
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002245 | Grad Max: 0.051559
  -> Layer: shared_layers.0.bias | Grad Mean: 0.082108 | Grad Max: 0.585899
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005935
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006808 | Grad Max: 0.006808
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000597 | Grad Max: 0.097763
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010255 | Grad Max: 0.509453
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002833
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001802 | Grad Max: 0.016736
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000166
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000297 | Grad Max: 0.002063
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000076 | Grad Max: 0.000542
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.000942
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000106 | Grad Max: 0.000106
[GRADIENT NORM TOTAL] 1.9875

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.618
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54285675 0.45714328] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 667/1381 | B: 613/1435 | C: 472/1576
[LOSS Ex1] A: 0.64829 | B: 0.63576 | C: 0.63155
[LOGITS Ex2 A] Mean Abs: 1.972 | Max: 5.889
[LOSS Ex2] A: 0.13702 | B: 0.35788 | C: 0.25902
** [JOINT LOSS] ** : 0.889834
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003285 | Grad Max: 0.109914
  -> Layer: shared_layers.0.bias | Grad Mean: 0.129786 | Grad Max: 0.651775
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.005853
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007357 | Grad Max: 0.007357
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001006 | Grad Max: 0.224363
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017123 | Grad Max: 1.248864
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000083 | Grad Max: 0.004963
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004741 | Grad Max: 0.036310
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000188
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000841 | Grad Max: 0.003578
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000220 | Grad Max: 0.000795
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000352 | Grad Max: 0.000950
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003912 | Grad Max: 0.003912
[GRADIENT NORM TOTAL] 3.4073

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.779
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7366611  0.26333892] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.067
[MASKS] A(Pass/Fail): 722/1326 | B: 615/1433 | C: 511/1537
[LOSS Ex1] A: 0.64261 | B: 0.63147 | C: 0.62404
[LOGITS Ex2 A] Mean Abs: 2.020 | Max: 5.884
[LOSS Ex2] A: 0.11526 | B: 0.34628 | C: 0.26260
** [JOINT LOSS] ** : 0.874087
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003082 | Grad Max: 0.066919
  -> Layer: shared_layers.0.bias | Grad Mean: 0.124325 | Grad Max: 0.460529
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002325 | Grad Max: 0.006659
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001158 | Grad Max: 0.001158
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000926 | Grad Max: 0.154986
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017218 | Grad Max: 0.864799
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004569
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008072 | Grad Max: 0.042038
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000263
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001670 | Grad Max: 0.004791
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000434 | Grad Max: 0.001148
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000471 | Grad Max: 0.001461
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008143 | Grad Max: 0.008143
[GRADIENT NORM TOTAL] 2.8305

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.865
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008229  0.49917713] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.066
[MASKS] A(Pass/Fail): 705/1343 | B: 577/1279 | C: 449/1599
[LOSS Ex1] A: 0.64898 | B: 0.63571 | C: 0.63036
[LOGITS Ex2 A] Mean Abs: 2.045 | Max: 6.122
[LOSS Ex2] A: 0.12510 | B: 0.33710 | C: 0.27229
** [JOINT LOSS] ** : 0.883177
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.064026
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134315 | Grad Max: 0.725793
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005782
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002564 | Grad Max: 0.002564
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000995 | Grad Max: 0.198799
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017991 | Grad Max: 1.113783
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.006542
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007496 | Grad Max: 0.053893
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001442 | Grad Max: 0.004185
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001240
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000297 | Grad Max: 0.001213
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006868 | Grad Max: 0.006868
[GRADIENT NORM TOTAL] 3.2013

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.597
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.68784106 0.31215897] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.067
[MASKS] A(Pass/Fail): 690/1358 | B: 615/1433 | C: 452/1596
[LOSS Ex1] A: 0.64504 | B: 0.63481 | C: 0.63239
[LOGITS Ex2 A] Mean Abs: 2.026 | Max: 6.399
[LOSS Ex2] A: 0.14482 | B: 0.35572 | C: 0.25875
** [JOINT LOSS] ** : 0.890509
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.037767
  -> Layer: shared_layers.0.bias | Grad Mean: 0.088189 | Grad Max: 0.382013
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005902
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001425 | Grad Max: 0.001425
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000654 | Grad Max: 0.457886
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011308 | Grad Max: 2.543863
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002419
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002039 | Grad Max: 0.018893
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000150
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000353 | Grad Max: 0.001975
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000058
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000095 | Grad Max: 0.000503
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000228 | Grad Max: 0.000863
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001772 | Grad Max: 0.001772
[GRADIENT NORM TOTAL] 3.5553

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.696
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61554265 0.38445738] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.067
[MASKS] A(Pass/Fail): 582/1034 | B: 613/1435 | C: 313/1063
[LOSS Ex1] A: 0.64334 | B: 0.63553 | C: 0.62897
[LOGITS Ex2 A] Mean Abs: 2.082 | Max: 7.467
[LOSS Ex2] A: 0.12561 | B: 0.35612 | C: 0.25399
** [JOINT LOSS] ** : 0.881186
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002313 | Grad Max: 0.038081
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107075 | Grad Max: 0.529798
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006354
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002872 | Grad Max: 0.002872
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000778 | Grad Max: 0.195333
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013538 | Grad Max: 1.101705
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.004487
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003889 | Grad Max: 0.032440
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000174
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000763 | Grad Max: 0.003038
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000214 | Grad Max: 0.000781
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000333 | Grad Max: 0.001120
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004629 | Grad Max: 0.004629
[GRADIENT NORM TOTAL] 2.7648

[EPOCH SUMMARY] Train Loss: 0.8831

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8650 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8685 -> New: 0.8650)

############################## EPOCH 118/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.868
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073756  0.49262443] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 702/1346 | B: 615/1433 | C: 476/1572
[LOSS Ex1] A: 0.64391 | B: 0.63123 | C: 0.62546
[LOGITS Ex2 A] Mean Abs: 2.079 | Max: 8.594
[LOSS Ex2] A: 0.12453 | B: 0.34479 | C: 0.23734
** [JOINT LOSS] ** : 0.869086
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003564 | Grad Max: 0.123815
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127870 | Grad Max: 0.624806
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002329 | Grad Max: 0.006112
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004159 | Grad Max: 0.004159
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001185 | Grad Max: 0.286979
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021083 | Grad Max: 1.574776
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000130 | Grad Max: 0.004044
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008150 | Grad Max: 0.035002
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000272
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001840 | Grad Max: 0.004662
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000128
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000506 | Grad Max: 0.001191
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000508 | Grad Max: 0.001611
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010120 | Grad Max: 0.010120
[GRADIENT NORM TOTAL] 3.7276

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.817
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5106333  0.48936668] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 692/1356 | B: 578/1278 | C: 484/1564
[LOSS Ex1] A: 0.64097 | B: 0.63547 | C: 0.62903
[LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.126
[LOSS Ex2] A: 0.13766 | B: 0.33150 | C: 0.29036
** [JOINT LOSS] ** : 0.888326
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.042046
  -> Layer: shared_layers.0.bias | Grad Mean: 0.101142 | Grad Max: 0.531291
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.006226
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004372 | Grad Max: 0.004372
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000832 | Grad Max: 0.166197
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014489 | Grad Max: 0.894474
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.004391
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004247 | Grad Max: 0.033946
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000182
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000913 | Grad Max: 0.002952
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000074
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000268 | Grad Max: 0.001039
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000198 | Grad Max: 0.000626
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005498 | Grad Max: 0.005498
[GRADIENT NORM TOTAL] 2.5857

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.843
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040401  0.49595985] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 694/1354 | B: 618/1430 | C: 482/1566
[LOSS Ex1] A: 0.63996 | B: 0.63457 | C: 0.62932
[LOGITS Ex2 A] Mean Abs: 2.062 | Max: 7.392
[LOSS Ex2] A: 0.13704 | B: 0.36123 | C: 0.23973
** [JOINT LOSS] ** : 0.880616
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002386 | Grad Max: 0.062750
  -> Layer: shared_layers.0.bias | Grad Mean: 0.062765 | Grad Max: 0.277261
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006702
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002735 | Grad Max: 0.002735
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000665 | Grad Max: 0.158514
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011061 | Grad Max: 0.870404
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002454
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002186 | Grad Max: 0.021681
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000115
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000380 | Grad Max: 0.002361
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000077
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.000725
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000271 | Grad Max: 0.001052
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002104 | Grad Max: 0.002104
[GRADIENT NORM TOTAL] 2.1017

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.699
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50252694 0.49747303] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.066
[MASKS] A(Pass/Fail): 669/1379 | B: 613/1435 | C: 475/1573
[LOSS Ex1] A: 0.64801 | B: 0.63528 | C: 0.63063
[LOGITS Ex2 A] Mean Abs: 2.059 | Max: 6.622
[LOSS Ex2] A: 0.12278 | B: 0.35417 | C: 0.25991
** [JOINT LOSS] ** : 0.883596
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003193 | Grad Max: 0.117395
  -> Layer: shared_layers.0.bias | Grad Mean: 0.228976 | Grad Max: 1.395059
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.006065
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008917 | Grad Max: 0.008917
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001463 | Grad Max: 0.257280
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025811 | Grad Max: 1.435367
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.007003
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008843 | Grad Max: 0.070127
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000242
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001634 | Grad Max: 0.005038
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000401 | Grad Max: 0.001157
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000321 | Grad Max: 0.001090
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006301 | Grad Max: 0.006301
[GRADIENT NORM TOTAL] 5.1017

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.623
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.542925   0.45707506] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 667/1381 | B: 615/1433 | C: 460/1588
[LOSS Ex1] A: 0.64801 | B: 0.63098 | C: 0.62984
[LOGITS Ex2 A] Mean Abs: 2.019 | Max: 5.710
[LOSS Ex2] A: 0.12503 | B: 0.34189 | C: 0.25494
** [JOINT LOSS] ** : 0.876896
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004426 | Grad Max: 0.160879
  -> Layer: shared_layers.0.bias | Grad Mean: 0.150860 | Grad Max: 0.766975
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.006265
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009393 | Grad Max: 0.009393
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001177 | Grad Max: 0.194647
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019706 | Grad Max: 1.012411
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.006114
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005883 | Grad Max: 0.051696
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000179
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001046 | Grad Max: 0.003218
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000103
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000282 | Grad Max: 0.000969
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001280
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005297 | Grad Max: 0.005297
[GRADIENT NORM TOTAL] 3.5320

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.786
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7385293  0.26147068] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.068
[MASKS] A(Pass/Fail): 724/1324 | B: 578/1278 | C: 477/1571
[LOSS Ex1] A: 0.64230 | B: 0.63521 | C: 0.62856
[LOGITS Ex2 A] Mean Abs: 2.044 | Max: 6.314
[LOSS Ex2] A: 0.13149 | B: 0.34454 | C: 0.27042
** [JOINT LOSS] ** : 0.884171
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005466 | Grad Max: 0.166292
  -> Layer: shared_layers.0.bias | Grad Mean: 0.290797 | Grad Max: 1.089170
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002264 | Grad Max: 0.006024
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005556 | Grad Max: 0.005556
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.266292
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034392 | Grad Max: 1.495508
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.008605
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018161 | Grad Max: 0.080530
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000402
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003950 | Grad Max: 0.008601
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000228
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001041 | Grad Max: 0.002686
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000915 | Grad Max: 0.001793
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019534 | Grad Max: 0.019534
[GRADIENT NORM TOTAL] 5.6737

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.873
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008069  0.49919307] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 705/1343 | B: 620/1428 | C: 479/1569
[LOSS Ex1] A: 0.64868 | B: 0.63432 | C: 0.62772
[LOGITS Ex2 A] Mean Abs: 2.061 | Max: 5.831
[LOSS Ex2] A: 0.11687 | B: 0.35845 | C: 0.26304
** [JOINT LOSS] ** : 0.883024
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004430 | Grad Max: 0.128560
  -> Layer: shared_layers.0.bias | Grad Mean: 0.173631 | Grad Max: 0.818400
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.005690
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003337 | Grad Max: 0.003337
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001180 | Grad Max: 0.535256
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021201 | Grad Max: 2.992258
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.005949
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008098 | Grad Max: 0.051575
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000265
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001871 | Grad Max: 0.005031
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000515 | Grad Max: 0.001337
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000522 | Grad Max: 0.001490
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010415 | Grad Max: 0.010415
[GRADIENT NORM TOTAL] 4.8991

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.603
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6890237 0.3109764] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.067
[MASKS] A(Pass/Fail): 690/1358 | B: 613/1435 | C: 470/1578
[LOSS Ex1] A: 0.64471 | B: 0.63504 | C: 0.63199
[LOGITS Ex2 A] Mean Abs: 2.071 | Max: 5.784
[LOSS Ex2] A: 0.14455 | B: 0.35215 | C: 0.27540
** [JOINT LOSS] ** : 0.894619
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003459 | Grad Max: 0.115831
  -> Layer: shared_layers.0.bias | Grad Mean: 0.298520 | Grad Max: 1.532266
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005909
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007841 | Grad Max: 0.007841
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.336028
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036836 | Grad Max: 1.888932
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.007852
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016768 | Grad Max: 0.071836
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000368
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003511 | Grad Max: 0.007969
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000182
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000882 | Grad Max: 0.002330
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000698 | Grad Max: 0.001696
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014771 | Grad Max: 0.014771
[GRADIENT NORM TOTAL] 6.8503

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.702
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61610585 0.38389418] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 582/1034 | B: 615/1433 | C: 488/1560
[LOSS Ex1] A: 0.64300 | B: 0.63074 | C: 0.62621
[LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.304
[LOSS Ex2] A: 0.12166 | B: 0.33325 | C: 0.23975
** [JOINT LOSS] ** : 0.864871
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004241 | Grad Max: 0.131772
  -> Layer: shared_layers.0.bias | Grad Mean: 0.241557 | Grad Max: 0.928953
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.006336
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005991 | Grad Max: 0.005991
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001671 | Grad Max: 0.240553
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029924 | Grad Max: 1.237408
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008245
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014858 | Grad Max: 0.081060
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000332
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003252 | Grad Max: 0.007119
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000844 | Grad Max: 0.002195
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000664 | Grad Max: 0.001918
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015208 | Grad Max: 0.015208
[GRADIENT NORM TOTAL] 5.0880

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.875
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074171 0.4925829] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 702/1346 | B: 578/1278 | C: 480/1568
[LOSS Ex1] A: 0.64358 | B: 0.63499 | C: 0.62862
[LOGITS Ex2 A] Mean Abs: 2.077 | Max: 7.743
[LOSS Ex2] A: 0.12222 | B: 0.33976 | C: 0.27169
** [JOINT LOSS] ** : 0.880285
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003411 | Grad Max: 0.120736
  -> Layer: shared_layers.0.bias | Grad Mean: 0.217723 | Grad Max: 1.111595
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002233 | Grad Max: 0.006030
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004413 | Grad Max: 0.004413
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001474 | Grad Max: 0.240977
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026125 | Grad Max: 1.318183
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.007883
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011265 | Grad Max: 0.081759
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000260
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002085 | Grad Max: 0.004944
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000129
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000539 | Grad Max: 0.001631
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000440 | Grad Max: 0.001342
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010285 | Grad Max: 0.010285
[GRADIENT NORM TOTAL] 4.8720

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.824
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51054645 0.48945358] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 692/1356 | B: 620/1428 | C: 468/1580
[LOSS Ex1] A: 0.64063 | B: 0.63411 | C: 0.63061
[LOGITS Ex2 A] Mean Abs: 2.064 | Max: 6.448
[LOSS Ex2] A: 0.12333 | B: 0.36329 | C: 0.26403
** [JOINT LOSS] ** : 0.885336
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004258 | Grad Max: 0.129563
  -> Layer: shared_layers.0.bias | Grad Mean: 0.338202 | Grad Max: 1.628029
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.006263
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000715 | Grad Max: 0.000715
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.363811
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040319 | Grad Max: 1.976565
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000282 | Grad Max: 0.011575
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018967 | Grad Max: 0.124275
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000406
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003759 | Grad Max: 0.008381
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000998 | Grad Max: 0.002274
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000840 | Grad Max: 0.001910
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019657 | Grad Max: 0.019657
[GRADIENT NORM TOTAL] 7.6177

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.850
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50418794 0.4958121 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 695/1353 | B: 613/1435 | C: 457/1591
[LOSS Ex1] A: 0.63961 | B: 0.63484 | C: 0.63021
[LOGITS Ex2 A] Mean Abs: 2.055 | Max: 6.658
[LOSS Ex2] A: 0.14222 | B: 0.35407 | C: 0.26782
** [JOINT LOSS] ** : 0.889590
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005155 | Grad Max: 0.265093
  -> Layer: shared_layers.0.bias | Grad Mean: 0.180928 | Grad Max: 0.724091
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002273 | Grad Max: 0.006190
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004134 | Grad Max: 0.004134
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.236053
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023536 | Grad Max: 1.330280
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.006025
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009470 | Grad Max: 0.052010
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000269
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002226 | Grad Max: 0.005432
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000128
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000562 | Grad Max: 0.001394
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000473 | Grad Max: 0.001453
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009934 | Grad Max: 0.009934
[GRADIENT NORM TOTAL] 4.2578

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.705
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025715 0.4974285] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.066
[MASKS] A(Pass/Fail): 669/1379 | B: 615/1433 | C: 465/1583
[LOSS Ex1] A: 0.64771 | B: 0.63054 | C: 0.62708
[LOGITS Ex2 A] Mean Abs: 2.043 | Max: 6.659
[LOSS Ex2] A: 0.12550 | B: 0.34319 | C: 0.23397
** [JOINT LOSS] ** : 0.869329
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002533 | Grad Max: 0.087508
  -> Layer: shared_layers.0.bias | Grad Mean: 0.121051 | Grad Max: 0.699329
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005677
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004151 | Grad Max: 0.004151
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000785 | Grad Max: 0.197391
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013721 | Grad Max: 1.099622
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003207
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002800 | Grad Max: 0.020416
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000624 | Grad Max: 0.002599
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000068
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.000803
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000429 | Grad Max: 0.001092
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003586 | Grad Max: 0.003586
[GRADIENT NORM TOTAL] 3.0923

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.629
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5429262  0.45707378] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.064
[MASKS] A(Pass/Fail): 667/1381 | B: 578/1278 | C: 308/1068
[LOSS Ex1] A: 0.64772 | B: 0.63480 | C: 0.63120
[LOGITS Ex2 A] Mean Abs: 2.031 | Max: 6.584
[LOSS Ex2] A: 0.13131 | B: 0.33552 | C: 0.25513
** [JOINT LOSS] ** : 0.878561
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003778 | Grad Max: 0.156519
  -> Layer: shared_layers.0.bias | Grad Mean: 0.119198 | Grad Max: 0.575449
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005656
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001120 | Grad Max: 0.001120
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000994 | Grad Max: 0.167494
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016780 | Grad Max: 0.931581
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.004372
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005228 | Grad Max: 0.035128
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000220
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001227 | Grad Max: 0.004077
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000289 | Grad Max: 0.001169
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.001153
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004470 | Grad Max: 0.004470
[GRADIENT NORM TOTAL] 2.9734

[EPOCH SUMMARY] Train Loss: 0.8806

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8638 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8650 -> New: 0.8638)

############################## EPOCH 119/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.792
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7401366  0.25986338] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.068
[MASKS] A(Pass/Fail): 726/1322 | B: 620/1428 | C: 460/1588
[LOSS Ex1] A: 0.64200 | B: 0.63391 | C: 0.63130
[LOGITS Ex2 A] Mean Abs: 2.081 | Max: 6.505
[LOSS Ex2] A: 0.12977 | B: 0.35641 | C: 0.25631
** [JOINT LOSS] ** : 0.883234
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002855 | Grad Max: 0.093757
  -> Layer: shared_layers.0.bias | Grad Mean: 0.098171 | Grad Max: 0.418892
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.005765
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001128 | Grad Max: 0.001128
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000812 | Grad Max: 0.173357
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013714 | Grad Max: 0.977061
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.004021
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004009 | Grad Max: 0.030381
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000194
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000969 | Grad Max: 0.003301
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000253 | Grad Max: 0.001114
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000282 | Grad Max: 0.001117
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004306 | Grad Max: 0.004306
[GRADIENT NORM TOTAL] 2.8156

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.880
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50075 0.49925] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 709/1339 | B: 613/1435 | C: 484/1564
[LOSS Ex1] A: 0.64841 | B: 0.63463 | C: 0.63095
[LOGITS Ex2 A] Mean Abs: 2.108 | Max: 6.293
[LOSS Ex2] A: 0.11849 | B: 0.35320 | C: 0.27064
** [JOINT LOSS] ** : 0.885440
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003382 | Grad Max: 0.138313
  -> Layer: shared_layers.0.bias | Grad Mean: 0.333505 | Grad Max: 1.616851
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005810
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002531 | Grad Max: 0.002531
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.319077
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040859 | Grad Max: 1.754198
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.013062
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019659 | Grad Max: 0.124852
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000390
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003956 | Grad Max: 0.008741
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000184
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001034 | Grad Max: 0.002384
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000790 | Grad Max: 0.002020
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018850 | Grad Max: 0.018850
[GRADIENT NORM TOTAL] 7.2545

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.608
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.69003665 0.30996335] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.068
[MASKS] A(Pass/Fail): 694/1354 | B: 615/1433 | C: 460/1588
[LOSS Ex1] A: 0.64442 | B: 0.63033 | C: 0.63360
[LOGITS Ex2 A] Mean Abs: 2.086 | Max: 6.844
[LOSS Ex2] A: 0.14404 | B: 0.34545 | C: 0.28264
** [JOINT LOSS] ** : 0.893496
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002753 | Grad Max: 0.064510
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182465 | Grad Max: 0.707772
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005721
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001224 | Grad Max: 0.001224
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001390 | Grad Max: 0.165764
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024897 | Grad Max: 0.896910
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.007745
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011955 | Grad Max: 0.070349
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000298
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002415 | Grad Max: 0.005928
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000137
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000609 | Grad Max: 0.001889
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000427 | Grad Max: 0.001097
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009946 | Grad Max: 0.009946
[GRADIENT NORM TOTAL] 4.0795

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.708
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6167099 0.3832901] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.068
[MASKS] A(Pass/Fail): 586/1030 | B: 578/1278 | C: 486/1562
[LOSS Ex1] A: 0.64272 | B: 0.63458 | C: 0.62600
[LOGITS Ex2 A] Mean Abs: 2.098 | Max: 7.227
[LOSS Ex2] A: 0.11618 | B: 0.35130 | C: 0.25887
** [JOINT LOSS] ** : 0.876552
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005784 | Grad Max: 0.153037
  -> Layer: shared_layers.0.bias | Grad Mean: 0.401699 | Grad Max: 1.675073
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006042
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003441 | Grad Max: 0.003441
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002456 | Grad Max: 0.385146
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045217 | Grad Max: 2.153286
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000346 | Grad Max: 0.013510
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023356 | Grad Max: 0.124863
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000462
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004957 | Grad Max: 0.010328
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000240
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001301 | Grad Max: 0.003315
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001106 | Grad Max: 0.002358
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024243 | Grad Max: 0.024243
[GRADIENT NORM TOTAL] 8.0991

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.881
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073905  0.49260947] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.069
[MASKS] A(Pass/Fail): 707/1341 | B: 620/1428 | C: 487/1561
[LOSS Ex1] A: 0.64330 | B: 0.63371 | C: 0.63196
[LOGITS Ex2 A] Mean Abs: 2.072 | Max: 7.718
[LOSS Ex2] A: 0.12607 | B: 0.37390 | C: 0.25519
** [JOINT LOSS] ** : 0.888041
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004887 | Grad Max: 0.158186
  -> Layer: shared_layers.0.bias | Grad Mean: 0.493235 | Grad Max: 2.098393
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002140 | Grad Max: 0.006153
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000051 | Grad Max: 0.000051
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002989 | Grad Max: 0.501698
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055909 | Grad Max: 2.814673
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad Max: 0.015660
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028751 | Grad Max: 0.162421
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000505
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005925 | Grad Max: 0.011934
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000268
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001549 | Grad Max: 0.003518
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001335 | Grad Max: 0.002616
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029788 | Grad Max: 0.029788
[GRADIENT NORM TOTAL] 10.0876

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.830
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105035  0.48949653] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 695/1353 | B: 613/1435 | C: 496/1552
[LOSS Ex1] A: 0.64034 | B: 0.63443 | C: 0.62559
[LOGITS Ex2 A] Mean Abs: 2.083 | Max: 5.864
[LOSS Ex2] A: 0.13229 | B: 0.35066 | C: 0.24295
** [JOINT LOSS] ** : 0.875423
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003773 | Grad Max: 0.137755
  -> Layer: shared_layers.0.bias | Grad Mean: 0.088209 | Grad Max: 0.576490
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.006405
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002432 | Grad Max: 0.002432
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000838 | Grad Max: 0.108461
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013554 | Grad Max: 0.584640
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003706
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002549 | Grad Max: 0.031951
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000166
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000290 | Grad Max: 0.001803
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000074 | Grad Max: 0.000403
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000302 | Grad Max: 0.000794
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000250 | Grad Max: 0.000250
[GRADIENT NORM TOTAL] 2.2569

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.856
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041989  0.49580115] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 699/1349 | B: 616/1432 | C: 473/1575
[LOSS Ex1] A: 0.63933 | B: 0.63013 | C: 0.62865
[LOGITS Ex2 A] Mean Abs: 2.089 | Max: 6.991
[LOSS Ex2] A: 0.14932 | B: 0.33467 | C: 0.25047
** [JOINT LOSS] ** : 0.877522
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007799 | Grad Max: 0.258655
  -> Layer: shared_layers.0.bias | Grad Mean: 0.451859 | Grad Max: 1.987425
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002318 | Grad Max: 0.006324
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001687 | Grad Max: 0.001687
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003118 | Grad Max: 0.369707
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057066 | Grad Max: 2.104752
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000429 | Grad Max: 0.013432
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028482 | Grad Max: 0.150955
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000558
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006050 | Grad Max: 0.012223
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001565 | Grad Max: 0.003841
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001322 | Grad Max: 0.002803
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028933 | Grad Max: 0.028933
[GRADIENT NORM TOTAL] 9.6278

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.710
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50246924 0.4975308 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.067
[MASKS] A(Pass/Fail): 675/1373 | B: 577/1279 | C: 455/1593
[LOSS Ex1] A: 0.64745 | B: 0.63439 | C: 0.62772
[LOGITS Ex2 A] Mean Abs: 2.073 | Max: 6.222
[LOSS Ex2] A: 0.12009 | B: 0.33114 | C: 0.25040
** [JOINT LOSS] ** : 0.870401
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004943 | Grad Max: 0.124111
  -> Layer: shared_layers.0.bias | Grad Mean: 0.317073 | Grad Max: 1.506788
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.006086
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000525 | Grad Max: 0.000525
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.317223
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039538 | Grad Max: 1.786360
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000293 | Grad Max: 0.011063
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019730 | Grad Max: 0.115256
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000363
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004123 | Grad Max: 0.008950
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000192
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001077 | Grad Max: 0.002411
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000861 | Grad Max: 0.002406
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019524 | Grad Max: 0.019524
[GRADIENT NORM TOTAL] 6.9299

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.633
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54301065 0.45698932] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.551 | Std: 0.065
[MASKS] A(Pass/Fail): 672/1376 | B: 621/1427 | C: 466/1582
[LOSS Ex1] A: 0.64747 | B: 0.63352 | C: 0.62658
[LOGITS Ex2 A] Mean Abs: 2.003 | Max: 5.979
[LOSS Ex2] A: 0.13408 | B: 0.35564 | C: 0.24058
** [JOINT LOSS] ** : 0.879294
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004970 | Grad Max: 0.159923
  -> Layer: shared_layers.0.bias | Grad Mean: 0.259203 | Grad Max: 1.244950
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005927
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004473 | Grad Max: 0.004473
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001693 | Grad Max: 0.309774
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030595 | Grad Max: 1.744408
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000230 | Grad Max: 0.007562
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015296 | Grad Max: 0.074676
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000321
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003302 | Grad Max: 0.007543
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000207
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000852 | Grad Max: 0.002415
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000688 | Grad Max: 0.001909
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015251 | Grad Max: 0.015251
[GRADIENT NORM TOTAL] 5.3860

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.798
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7416169  0.25838315] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.068
[MASKS] A(Pass/Fail): 727/1321 | B: 616/1432 | C: 480/1568
[LOSS Ex1] A: 0.64173 | B: 0.63425 | C: 0.62545
[LOGITS Ex2 A] Mean Abs: 2.062 | Max: 6.161
[LOSS Ex2] A: 0.12674 | B: 0.35867 | C: 0.24695
** [JOINT LOSS] ** : 0.877927
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003813 | Grad Max: 0.088999
  -> Layer: shared_layers.0.bias | Grad Mean: 0.223648 | Grad Max: 1.194478
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.006242
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004686 | Grad Max: 0.004686
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001392 | Grad Max: 0.298440
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025679 | Grad Max: 1.669621
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.006691
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012463 | Grad Max: 0.071729
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000288
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002644 | Grad Max: 0.006347
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000690 | Grad Max: 0.001841
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000610 | Grad Max: 0.001788
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013198 | Grad Max: 0.013198
[GRADIENT NORM TOTAL] 4.7492

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.885
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50080824 0.49919173] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.067
[MASKS] A(Pass/Fail): 709/1339 | B: 616/1432 | C: 494/1554
[LOSS Ex1] A: 0.64817 | B: 0.62994 | C: 0.62606
[LOGITS Ex2 A] Mean Abs: 2.083 | Max: 6.063
[LOSS Ex2] A: 0.12174 | B: 0.33989 | C: 0.27540
** [JOINT LOSS] ** : 0.880400
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003556 | Grad Max: 0.107292
  -> Layer: shared_layers.0.bias | Grad Mean: 0.256378 | Grad Max: 1.260667
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.005633
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000722 | Grad Max: 0.000722
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001819 | Grad Max: 0.308335
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032356 | Grad Max: 1.724711
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008540
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015010 | Grad Max: 0.099568
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000314
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002920 | Grad Max: 0.007341
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000133
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000752 | Grad Max: 0.001791
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000523 | Grad Max: 0.001538
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013181 | Grad Max: 0.013181
[GRADIENT NORM TOTAL] 5.8702

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.613
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6910062 0.3089938] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.068
[MASKS] A(Pass/Fail): 694/1354 | B: 577/1279 | C: 475/1573
[LOSS Ex1] A: 0.64417 | B: 0.63420 | C: 0.62812
[LOGITS Ex2 A] Mean Abs: 2.066 | Max: 6.347
[LOSS Ex2] A: 0.14015 | B: 0.32922 | C: 0.27711
** [JOINT LOSS] ** : 0.884326
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003317 | Grad Max: 0.092843
  -> Layer: shared_layers.0.bias | Grad Mean: 0.144288 | Grad Max: 0.583152
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005954
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004854 | Grad Max: 0.004854
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001103 | Grad Max: 0.191674
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018812 | Grad Max: 1.032359
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000096 | Grad Max: 0.004733
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005835 | Grad Max: 0.044342
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000187
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001085 | Grad Max: 0.003963
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000273 | Grad Max: 0.001052
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000203 | Grad Max: 0.000813
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004374 | Grad Max: 0.004374
[GRADIENT NORM TOTAL] 3.2137

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.713
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61721504 0.382785  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.068
[MASKS] A(Pass/Fail): 586/1030 | B: 623/1425 | C: 472/1576
[LOSS Ex1] A: 0.64246 | B: 0.63335 | C: 0.62832
[LOGITS Ex2 A] Mean Abs: 2.084 | Max: 8.261
[LOSS Ex2] A: 0.12908 | B: 0.36206 | C: 0.24894
** [JOINT LOSS] ** : 0.881402
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004346 | Grad Max: 0.099637
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264310 | Grad Max: 1.333746
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.006060
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003614 | Grad Max: 0.003614
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001789 | Grad Max: 0.363612
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032547 | Grad Max: 2.043206
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.009036
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014838 | Grad Max: 0.087668
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000365
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003171 | Grad Max: 0.006945
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000842 | Grad Max: 0.002040
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000754 | Grad Max: 0.001812
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016603 | Grad Max: 0.016603
[GRADIENT NORM TOTAL] 6.1324

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.887
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074149 0.4925851] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.069
[MASKS] A(Pass/Fail): 707/1341 | B: 616/1432 | C: 323/1053
[LOSS Ex1] A: 0.64304 | B: 0.63407 | C: 0.62456
[LOGITS Ex2 A] Mean Abs: 2.091 | Max: 6.523
[LOSS Ex2] A: 0.11661 | B: 0.35810 | C: 0.23919
** [JOINT LOSS] ** : 0.871858
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002306 | Grad Max: 0.064889
  -> Layer: shared_layers.0.bias | Grad Mean: 0.095471 | Grad Max: 0.494144
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.005559
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000356 | Grad Max: 0.000356
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000785 | Grad Max: 0.276631
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013588 | Grad Max: 1.513432
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.004671
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005460 | Grad Max: 0.044764
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000210
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001074 | Grad Max: 0.003736
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000104
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000293 | Grad Max: 0.000894
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000366 | Grad Max: 0.001366
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006139 | Grad Max: 0.006139
[GRADIENT NORM TOTAL] 2.8484

[EPOCH SUMMARY] Train Loss: 0.8804

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8631 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8638 -> New: 0.8631)

############################## EPOCH 120/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.836
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51043075 0.48956928] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 695/1353 | B: 617/1431 | C: 486/1562
[LOSS Ex1] A: 0.64007 | B: 0.62977 | C: 0.62807
[LOGITS Ex2 A] Mean Abs: 2.098 | Max: 5.670
[LOSS Ex2] A: 0.14046 | B: 0.32778 | C: 0.25866
** [JOINT LOSS] ** : 0.874939
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008478 | Grad Max: 0.264695
  -> Layer: shared_layers.0.bias | Grad Mean: 0.398476 | Grad Max: 1.763213
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002385 | Grad Max: 0.006270
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007877 | Grad Max: 0.007877
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002679 | Grad Max: 0.329524
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049034 | Grad Max: 1.816707
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.012778
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024841 | Grad Max: 0.143888
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000577
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005373 | Grad Max: 0.011460
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000279
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001406 | Grad Max: 0.003260
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001226 | Grad Max: 0.002631
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026453 | Grad Max: 0.026453
[GRADIENT NORM TOTAL] 7.9447

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.862
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5042654  0.49573457] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 699/1349 | B: 579/1277 | C: 481/1567
[LOSS Ex1] A: 0.63906 | B: 0.63403 | C: 0.62651
[LOGITS Ex2 A] Mean Abs: 2.073 | Max: 8.092
[LOSS Ex2] A: 0.15024 | B: 0.33400 | C: 0.25283
** [JOINT LOSS] ** : 0.878892
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006604 | Grad Max: 0.283503
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149921 | Grad Max: 0.474290
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.006598
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005939 | Grad Max: 0.005939
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001267 | Grad Max: 0.248300
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021279 | Grad Max: 1.401527
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000135 | Grad Max: 0.005334
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008107 | Grad Max: 0.042681
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000300
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001949 | Grad Max: 0.005346
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000119
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000493 | Grad Max: 0.001319
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000455 | Grad Max: 0.001347
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008507 | Grad Max: 0.008507
[GRADIENT NORM TOTAL] 4.0005

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.714
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024284  0.49757156] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.552 | Std: 0.067
[MASKS] A(Pass/Fail): 675/1373 | B: 625/1423 | C: 487/1561
[LOSS Ex1] A: 0.64720 | B: 0.63318 | C: 0.62444
[LOGITS Ex2 A] Mean Abs: 2.017 | Max: 6.230
[LOSS Ex2] A: 0.11906 | B: 0.36403 | C: 0.24395
** [JOINT LOSS] ** : 0.877288
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002449 | Grad Max: 0.122677
  -> Layer: shared_layers.0.bias | Grad Mean: 0.349526 | Grad Max: 1.602035
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.006514
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010257 | Grad Max: 0.010257
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.379170
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040987 | Grad Max: 2.140888
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.011813
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018749 | Grad Max: 0.108510
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003853 | Grad Max: 0.008358
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000172
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001007 | Grad Max: 0.002446
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000779 | Grad Max: 0.002000
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018274 | Grad Max: 0.018274
[GRADIENT NORM TOTAL] 7.8395

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.638
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54305553 0.45694444] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.065
[MASKS] A(Pass/Fail): 672/1376 | B: 616/1432 | C: 528/1520
[LOSS Ex1] A: 0.64723 | B: 0.63390 | C: 0.62230
[LOGITS Ex2 A] Mean Abs: 2.019 | Max: 6.153
[LOSS Ex2] A: 0.12692 | B: 0.35637 | C: 0.24861
** [JOINT LOSS] ** : 0.878444
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006412 | Grad Max: 0.172999
  -> Layer: shared_layers.0.bias | Grad Mean: 0.428342 | Grad Max: 1.639748
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.006931
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010590 | Grad Max: 0.010590
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002705 | Grad Max: 0.385128
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049725 | Grad Max: 2.174340
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000370 | Grad Max: 0.015013
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024852 | Grad Max: 0.135707
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000471
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005261 | Grad Max: 0.011194
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000246
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001350 | Grad Max: 0.003198
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001109 | Grad Max: 0.002484
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024715 | Grad Max: 0.024715
[GRADIENT NORM TOTAL] 8.4212

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.803
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.74309295 0.256907  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.069
[MASKS] A(Pass/Fail): 727/1321 | B: 618/1430 | C: 447/1601
[LOSS Ex1] A: 0.64147 | B: 0.62960 | C: 0.63280
[LOGITS Ex2 A] Mean Abs: 2.080 | Max: 5.952
[LOSS Ex2] A: 0.13019 | B: 0.33283 | C: 0.26167
** [JOINT LOSS] ** : 0.876187
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002775 | Grad Max: 0.116137
  -> Layer: shared_layers.0.bias | Grad Mean: 0.234785 | Grad Max: 1.170393
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002334 | Grad Max: 0.006250
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011066 | Grad Max: 0.011066
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001597 | Grad Max: 0.347664
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029161 | Grad Max: 1.944103
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000203 | Grad Max: 0.009121
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013646 | Grad Max: 0.091329
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000299
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002665 | Grad Max: 0.006309
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000127
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000714 | Grad Max: 0.001769
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000537 | Grad Max: 0.001706
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013262 | Grad Max: 0.013262
[GRADIENT NORM TOTAL] 5.5168

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.891
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50075966 0.4992403 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.068
[MASKS] A(Pass/Fail): 709/1339 | B: 579/1277 | C: 489/1559
[LOSS Ex1] A: 0.64793 | B: 0.63387 | C: 0.63203
[LOGITS Ex2 A] Mean Abs: 2.084 | Max: 6.093
[LOSS Ex2] A: 0.12078 | B: 0.33537 | C: 0.26488
** [JOINT LOSS] ** : 0.878285
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003246 | Grad Max: 0.135111
  -> Layer: shared_layers.0.bias | Grad Mean: 0.117706 | Grad Max: 0.648996
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005491
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003245 | Grad Max: 0.003245
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001034 | Grad Max: 0.182968
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017422 | Grad Max: 0.991508
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.005017
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004871 | Grad Max: 0.046235
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000189
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000744 | Grad Max: 0.002847
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000201 | Grad Max: 0.000845
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000328 | Grad Max: 0.000998
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003931 | Grad Max: 0.003931
[GRADIENT NORM TOTAL] 3.1265

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.617
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6919519 0.3080481] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.069
[MASKS] A(Pass/Fail): 696/1352 | B: 626/1422 | C: 473/1575
[LOSS Ex1] A: 0.64391 | B: 0.63302 | C: 0.62667
[LOGITS Ex2 A] Mean Abs: 2.054 | Max: 6.194
[LOSS Ex2] A: 0.14269 | B: 0.35484 | C: 0.24807
** [JOINT LOSS] ** : 0.883063
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004645 | Grad Max: 0.148820
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127531 | Grad Max: 0.470890
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006440
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001294 | Grad Max: 0.001294
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001007 | Grad Max: 0.203931
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017177 | Grad Max: 0.971015
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.003457
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006157 | Grad Max: 0.030798
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001465 | Grad Max: 0.004648
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000109
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000384 | Grad Max: 0.001164
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000367 | Grad Max: 0.001231
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007268 | Grad Max: 0.007268
[GRADIENT NORM TOTAL] 3.2332

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.717
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61772144 0.3822785 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.069
[MASKS] A(Pass/Fail): 586/1030 | B: 616/1432 | C: 478/1570
[LOSS Ex1] A: 0.64219 | B: 0.63373 | C: 0.62673
[LOGITS Ex2 A] Mean Abs: 2.107 | Max: 6.670
[LOSS Ex2] A: 0.13135 | B: 0.35213 | C: 0.24580
** [JOINT LOSS] ** : 0.877308
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.043146
  -> Layer: shared_layers.0.bias | Grad Mean: 0.146076 | Grad Max: 0.575778
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.006582
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008559 | Grad Max: 0.008559
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000775 | Grad Max: 0.471884
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013578 | Grad Max: 2.632607
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002975
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002039 | Grad Max: 0.022539
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000122
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000336 | Grad Max: 0.001992
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000054
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000092 | Grad Max: 0.000501
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000336 | Grad Max: 0.000956
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002260 | Grad Max: 0.002260
[GRADIENT NORM TOTAL] 4.3978

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.893
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50743437 0.49256563] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.069
[MASKS] A(Pass/Fail): 709/1339 | B: 618/1430 | C: 464/1584
[LOSS Ex1] A: 0.64278 | B: 0.62942 | C: 0.63090
[LOGITS Ex2 A] Mean Abs: 2.088 | Max: 8.077
[LOSS Ex2] A: 0.13150 | B: 0.33166 | C: 0.26512
** [JOINT LOSS] ** : 0.877127
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004503 | Grad Max: 0.155450
  -> Layer: shared_layers.0.bias | Grad Mean: 0.256991 | Grad Max: 0.843948
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.005952
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001341 | Grad Max: 0.001341
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001764 | Grad Max: 0.217039
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031723 | Grad Max: 1.214077
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000230 | Grad Max: 0.008917
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015539 | Grad Max: 0.087151
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000450
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003397 | Grad Max: 0.007455
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000151
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000888 | Grad Max: 0.001967
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000674 | Grad Max: 0.001873
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015806 | Grad Max: 0.015806
[GRADIENT NORM TOTAL] 5.1664

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.841
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104343  0.48956567] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.069
[MASKS] A(Pass/Fail): 697/1351 | B: 579/1277 | C: 474/1574
[LOSS Ex1] A: 0.63980 | B: 0.63369 | C: 0.62711
[LOGITS Ex2 A] Mean Abs: 2.074 | Max: 6.454
[LOSS Ex2] A: 0.12123 | B: 0.34160 | C: 0.25242
** [JOINT LOSS] ** : 0.871946
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004138 | Grad Max: 0.170396
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191491 | Grad Max: 0.877641
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002330 | Grad Max: 0.006655
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007025 | Grad Max: 0.007025
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001335 | Grad Max: 0.194814
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022613 | Grad Max: 1.076523
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.006968
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007847 | Grad Max: 0.072500
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000227
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001379 | Grad Max: 0.004616
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000315 | Grad Max: 0.001171
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000296 | Grad Max: 0.000946
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004551 | Grad Max: 0.004551
[GRADIENT NORM TOTAL] 3.8714

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.868
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043274  0.49567255] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.069
[MASKS] A(Pass/Fail): 700/1348 | B: 626/1422 | C: 498/1550
[LOSS Ex1] A: 0.63879 | B: 0.63283 | C: 0.62547
[LOGITS Ex2 A] Mean Abs: 2.043 | Max: 7.116
[LOSS Ex2] A: 0.14195 | B: 0.36034 | C: 0.24892
** [JOINT LOSS] ** : 0.882763
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004016 | Grad Max: 0.158473
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127116 | Grad Max: 0.678799
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002283 | Grad Max: 0.006414
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001121 | Grad Max: 0.001121
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001085 | Grad Max: 0.244348
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017465 | Grad Max: 1.327662
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000096 | Grad Max: 0.005590
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005093 | Grad Max: 0.048640
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000139
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000836 | Grad Max: 0.003095
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000060
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000239 | Grad Max: 0.000757
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000278 | Grad Max: 0.000962
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005312 | Grad Max: 0.005312
[GRADIENT NORM TOTAL] 3.2307

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.719
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50237817 0.49762183] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.068
[MASKS] A(Pass/Fail): 676/1372 | B: 616/1432 | C: 459/1589
[LOSS Ex1] A: 0.64696 | B: 0.63353 | C: 0.63174
[LOGITS Ex2 A] Mean Abs: 2.041 | Max: 6.198
[LOSS Ex2] A: 0.12250 | B: 0.36110 | C: 0.27295
** [JOINT LOSS] ** : 0.889593
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002716 | Grad Max: 0.079838
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191605 | Grad Max: 1.069549
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.005717
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006342 | Grad Max: 0.006342
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001285 | Grad Max: 0.307196
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023280 | Grad Max: 1.713290
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000120 | Grad Max: 0.004470
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008062 | Grad Max: 0.047424
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000251
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001632 | Grad Max: 0.004640
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000104
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000392 | Grad Max: 0.001277
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.001105
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005329 | Grad Max: 0.005329
[GRADIENT NORM TOTAL] 4.8120

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.642
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.543193   0.45680702] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.066
[MASKS] A(Pass/Fail): 673/1375 | B: 618/1430 | C: 456/1592
[LOSS Ex1] A: 0.64700 | B: 0.62922 | C: 0.62734
[LOGITS Ex2 A] Mean Abs: 2.035 | Max: 5.564
[LOSS Ex2] A: 0.13415 | B: 0.34160 | C: 0.24084
** [JOINT LOSS] ** : 0.873384
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003194 | Grad Max: 0.105914
  -> Layer: shared_layers.0.bias | Grad Mean: 0.193911 | Grad Max: 1.098228
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.005599
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000426 | Grad Max: 0.000426
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001272 | Grad Max: 0.288037
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022418 | Grad Max: 1.587873
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000119 | Grad Max: 0.007176
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007774 | Grad Max: 0.063793
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001492 | Grad Max: 0.004299
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000402 | Grad Max: 0.001214
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000415 | Grad Max: 0.001421
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008062 | Grad Max: 0.008062
[GRADIENT NORM TOTAL] 4.6026

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.808
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7445778  0.25542217] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.069
[MASKS] A(Pass/Fail): 729/1319 | B: 579/1277 | C: 344/1032
[LOSS Ex1] A: 0.64121 | B: 0.63348 | C: 0.62427
[LOGITS Ex2 A] Mean Abs: 2.049 | Max: 5.983
[LOSS Ex2] A: 0.12161 | B: 0.33995 | C: 0.24560
** [JOINT LOSS] ** : 0.868710
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.182794
  -> Layer: shared_layers.0.bias | Grad Mean: 0.222274 | Grad Max: 1.159581
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006732
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000747 | Grad Max: 0.000747
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001593 | Grad Max: 0.238079
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028571 | Grad Max: 1.318134
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.008380
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014780 | Grad Max: 0.071591
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000383
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003213 | Grad Max: 0.008563
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000216
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.002431
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000605 | Grad Max: 0.001910
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013454 | Grad Max: 0.013454
[GRADIENT NORM TOTAL] 4.6020

[EPOCH SUMMARY] Train Loss: 0.8777

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8622 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8631 -> New: 0.8622)

############################## EPOCH 121/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.897
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008039 0.4991961] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 711/1337 | B: 627/1421 | C: 456/1592
[LOSS Ex1] A: 0.64769 | B: 0.63263 | C: 0.62842
[LOGITS Ex2 A] Mean Abs: 2.089 | Max: 5.938
[LOSS Ex2] A: 0.12267 | B: 0.35958 | C: 0.25337
** [JOINT LOSS] ** : 0.881454
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003515 | Grad Max: 0.152677
  -> Layer: shared_layers.0.bias | Grad Mean: 0.077133 | Grad Max: 0.349109
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.005348
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001857 | Grad Max: 0.001857
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000820 | Grad Max: 0.172910
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012959 | Grad Max: 0.958847
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.004450
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002939 | Grad Max: 0.026045
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000348 | Grad Max: 0.002173
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000044
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000085 | Grad Max: 0.000410
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000300 | Grad Max: 0.000827
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000042 | Grad Max: 0.000042
[GRADIENT NORM TOTAL] 2.3633

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.622
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.692925   0.30707502] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.069
[MASKS] A(Pass/Fail): 697/1351 | B: 618/1430 | C: 508/1540
[LOSS Ex1] A: 0.64365 | B: 0.63333 | C: 0.62534
[LOGITS Ex2 A] Mean Abs: 2.070 | Max: 6.406
[LOSS Ex2] A: 0.13778 | B: 0.34885 | C: 0.25382
** [JOINT LOSS] ** : 0.880926
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003686 | Grad Max: 0.114242
  -> Layer: shared_layers.0.bias | Grad Mean: 0.091699 | Grad Max: 0.572788
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006093
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000487 | Grad Max: 0.000487
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000766 | Grad Max: 0.181400
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012078 | Grad Max: 0.993890
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.003065
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002044 | Grad Max: 0.024008
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000147
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000289 | Grad Max: 0.002017
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000079 | Grad Max: 0.000516
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000284 | Grad Max: 0.000817
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000301 | Grad Max: 0.000301
[GRADIENT NORM TOTAL] 2.4761

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.723
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61823475 0.38176525] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.069
[MASKS] A(Pass/Fail): 586/1030 | B: 618/1430 | C: 495/1553
[LOSS Ex1] A: 0.64194 | B: 0.62901 | C: 0.63010
[LOGITS Ex2 A] Mean Abs: 2.125 | Max: 6.780
[LOSS Ex2] A: 0.12402 | B: 0.33173 | C: 0.28387
** [JOINT LOSS] ** : 0.880225
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.048039
  -> Layer: shared_layers.0.bias | Grad Mean: 0.127705 | Grad Max: 0.633375
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006693
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010404 | Grad Max: 0.010404
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000710 | Grad Max: 0.493295
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012438 | Grad Max: 2.752285
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.003786
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002171 | Grad Max: 0.025876
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000132
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000371 | Grad Max: 0.002291
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000062
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000103 | Grad Max: 0.000497
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000248 | Grad Max: 0.000845
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003183 | Grad Max: 0.003183
[GRADIENT NORM TOTAL] 4.1310

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.899
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50739926 0.49260068] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 709/1339 | B: 580/1276 | C: 510/1538
[LOSS Ex1] A: 0.64252 | B: 0.63328 | C: 0.62595
[LOGITS Ex2 A] Mean Abs: 2.097 | Max: 8.081
[LOSS Ex2] A: 0.12721 | B: 0.33251 | C: 0.24867
** [JOINT LOSS] ** : 0.870046
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007179 | Grad Max: 0.291873
  -> Layer: shared_layers.0.bias | Grad Mean: 0.135588 | Grad Max: 0.567032
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005989
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003770 | Grad Max: 0.003770
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001194 | Grad Max: 0.281633
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017981 | Grad Max: 1.505011
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.004241
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003687 | Grad Max: 0.031781
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001029 | Grad Max: 0.003495
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000273 | Grad Max: 0.000908
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001333
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004918 | Grad Max: 0.004918
[GRADIENT NORM TOTAL] 3.4334

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.847
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51037085 0.4896292 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.069
[MASKS] A(Pass/Fail): 697/1351 | B: 627/1421 | C: 482/1566
[LOSS Ex1] A: 0.63953 | B: 0.63242 | C: 0.62928
[LOGITS Ex2 A] Mean Abs: 2.098 | Max: 6.822
[LOSS Ex2] A: 0.12521 | B: 0.35849 | C: 0.23218
** [JOINT LOSS] ** : 0.872370
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004891 | Grad Max: 0.178203
  -> Layer: shared_layers.0.bias | Grad Mean: 0.105337 | Grad Max: 0.534317
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006067
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002884 | Grad Max: 0.002884
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000957 | Grad Max: 0.370578
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015092 | Grad Max: 2.007571
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.003234
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002971 | Grad Max: 0.021518
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000164
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000771 | Grad Max: 0.003052
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000077
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000216 | Grad Max: 0.000846
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000318 | Grad Max: 0.001172
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004527 | Grad Max: 0.004527
[GRADIENT NORM TOTAL] 3.3896

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.874
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50440955 0.49559048] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.069
[MASKS] A(Pass/Fail): 701/1347 | B: 619/1429 | C: 493/1555
[LOSS Ex1] A: 0.63851 | B: 0.63312 | C: 0.62783
[LOGITS Ex2 A] Mean Abs: 2.071 | Max: 8.782
[LOSS Ex2] A: 0.13734 | B: 0.35894 | C: 0.25800
** [JOINT LOSS] ** : 0.884582
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003604 | Grad Max: 0.142161
  -> Layer: shared_layers.0.bias | Grad Mean: 0.172221 | Grad Max: 1.045993
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006581
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000460 | Grad Max: 0.000460
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001326 | Grad Max: 0.383385
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022468 | Grad Max: 2.089798
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.005961
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007567 | Grad Max: 0.054869
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000198
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001300 | Grad Max: 0.003832
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000352 | Grad Max: 0.001050
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000339 | Grad Max: 0.001053
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008103 | Grad Max: 0.008103
[GRADIENT NORM TOTAL] 4.4648

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.724
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023249  0.49767512] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.068
[MASKS] A(Pass/Fail): 676/1372 | B: 618/1430 | C: 459/1589
[LOSS Ex1] A: 0.64670 | B: 0.62880 | C: 0.63139
[LOGITS Ex2 A] Mean Abs: 2.047 | Max: 6.026
[LOSS Ex2] A: 0.11891 | B: 0.33641 | C: 0.26520
** [JOINT LOSS] ** : 0.875803
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002457 | Grad Max: 0.086046
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087280 | Grad Max: 0.441032
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005433
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002732 | Grad Max: 0.002732
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000711 | Grad Max: 0.176802
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012112 | Grad Max: 0.969255
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.003088
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001919 | Grad Max: 0.022003
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000167
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000342 | Grad Max: 0.002196
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000051
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000098 | Grad Max: 0.000553
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.000889
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001993 | Grad Max: 0.001993
[GRADIENT NORM TOTAL] 2.5814

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.647
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5432552  0.45674482] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.066
[MASKS] A(Pass/Fail): 673/1375 | B: 580/1276 | C: 493/1555
[LOSS Ex1] A: 0.64675 | B: 0.63306 | C: 0.62621
[LOGITS Ex2 A] Mean Abs: 2.040 | Max: 6.443
[LOSS Ex2] A: 0.13101 | B: 0.33014 | C: 0.24129
** [JOINT LOSS] ** : 0.869486
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005360 | Grad Max: 0.180258
  -> Layer: shared_layers.0.bias | Grad Mean: 0.178665 | Grad Max: 0.949077
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005986
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002751 | Grad Max: 0.002751
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001113 | Grad Max: 0.310187
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018160 | Grad Max: 1.724575
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.005772
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004070 | Grad Max: 0.055985
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000153
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000550 | Grad Max: 0.002608
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000152 | Grad Max: 0.000639
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000354 | Grad Max: 0.000882
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003076 | Grad Max: 0.003076
[GRADIENT NORM TOTAL] 4.0079

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.814
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.74609053 0.2539095 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.069
[MASKS] A(Pass/Fail): 729/1319 | B: 628/1420 | C: 535/1513
[LOSS Ex1] A: 0.64094 | B: 0.63220 | C: 0.62126
[LOGITS Ex2 A] Mean Abs: 2.101 | Max: 6.481
[LOSS Ex2] A: 0.12183 | B: 0.35276 | C: 0.25216
** [JOINT LOSS] ** : 0.873712
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003257 | Grad Max: 0.109684
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111002 | Grad Max: 0.641648
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002309 | Grad Max: 0.006114
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003537 | Grad Max: 0.003537
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000896 | Grad Max: 0.238026
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014975 | Grad Max: 1.334217
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.003361
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002381 | Grad Max: 0.020947
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000123
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000320 | Grad Max: 0.002122
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000059
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000499
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000253 | Grad Max: 0.000853
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001378 | Grad Max: 0.001378
[GRADIENT NORM TOTAL] 3.0767

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.903
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008179 0.4991821] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.068
[MASKS] A(Pass/Fail): 711/1337 | B: 620/1428 | C: 487/1561
[LOSS Ex1] A: 0.64743 | B: 0.63289 | C: 0.62944
[LOGITS Ex2 A] Mean Abs: 2.088 | Max: 6.071
[LOSS Ex2] A: 0.10855 | B: 0.35026 | C: 0.26059
** [JOINT LOSS] ** : 0.876389
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004235 | Grad Max: 0.175327
  -> Layer: shared_layers.0.bias | Grad Mean: 0.105022 | Grad Max: 0.510246
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005939
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008261 | Grad Max: 0.008261
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000955 | Grad Max: 0.103472
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016060 | Grad Max: 0.564838
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000104 | Grad Max: 0.004084
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006374 | Grad Max: 0.040665
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000236
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001516 | Grad Max: 0.004196
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000135
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000401 | Grad Max: 0.001489
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000460 | Grad Max: 0.001678
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008641 | Grad Max: 0.008641
[GRADIENT NORM TOTAL] 2.4671

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.627
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6939144 0.3060856] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.069
[MASKS] A(Pass/Fail): 698/1350 | B: 619/1429 | C: 508/1540
[LOSS Ex1] A: 0.64337 | B: 0.62857 | C: 0.62636
[LOGITS Ex2 A] Mean Abs: 2.108 | Max: 6.636
[LOSS Ex2] A: 0.13688 | B: 0.33619 | C: 0.23864
** [JOINT LOSS] ** : 0.869999
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003850 | Grad Max: 0.104151
  -> Layer: shared_layers.0.bias | Grad Mean: 0.253774 | Grad Max: 1.275019
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.006679
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003326 | Grad Max: 0.003326
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.294395
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032869 | Grad Max: 1.648871
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.008029
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015892 | Grad Max: 0.084083
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000346
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003337 | Grad Max: 0.007754
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000863 | Grad Max: 0.001980
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.001829
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015998 | Grad Max: 0.015998
[GRADIENT NORM TOTAL] 5.8204

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.729
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6188088  0.38119116] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 586/1030 | B: 581/1275 | C: 495/1553
[LOSS Ex1] A: 0.64164 | B: 0.63283 | C: 0.62599
[LOGITS Ex2 A] Mean Abs: 2.139 | Max: 7.786
[LOSS Ex2] A: 0.13577 | B: 0.33007 | C: 0.26752
** [JOINT LOSS] ** : 0.877941
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006751 | Grad Max: 0.233167
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226361 | Grad Max: 1.062162
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.006244
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007070 | Grad Max: 0.007070
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001336 | Grad Max: 0.451878
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021853 | Grad Max: 2.532056
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000080 | Grad Max: 0.004218
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002943 | Grad Max: 0.038898
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000163
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000323 | Grad Max: 0.002095
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000054
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000089 | Grad Max: 0.000430
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000205 | Grad Max: 0.000666
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001720 | Grad Max: 0.001720
[GRADIENT NORM TOTAL] 5.5118

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.905
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073731 0.4926269] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 710/1338 | B: 628/1420 | C: 505/1543
[LOSS Ex1] A: 0.64222 | B: 0.63198 | C: 0.62436
[LOGITS Ex2 A] Mean Abs: 2.105 | Max: 6.986
[LOSS Ex2] A: 0.12624 | B: 0.36330 | C: 0.24343
** [JOINT LOSS] ** : 0.877180
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007448 | Grad Max: 0.350605
  -> Layer: shared_layers.0.bias | Grad Mean: 0.263550 | Grad Max: 1.436710
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006376
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001408 | Grad Max: 0.001408
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001701 | Grad Max: 0.464097
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026800 | Grad Max: 2.592620
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.004808
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004256 | Grad Max: 0.040722
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000205
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000339 | Grad Max: 0.002594
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000054
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000747
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000249 | Grad Max: 0.000688
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000720 | Grad Max: 0.000720
[GRADIENT NORM TOTAL] 6.0786

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.854
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51040477 0.4895953 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 700/1348 | B: 621/1427 | C: 338/1038
[LOSS Ex1] A: 0.63922 | B: 0.63268 | C: 0.62584
[LOGITS Ex2 A] Mean Abs: 2.105 | Max: 5.953
[LOSS Ex2] A: 0.12920 | B: 0.35551 | C: 0.24918
** [JOINT LOSS] ** : 0.877207
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006247 | Grad Max: 0.208648
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143624 | Grad Max: 0.651515
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.006306
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003106 | Grad Max: 0.003106
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001097 | Grad Max: 0.205648
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017566 | Grad Max: 1.110768
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.004900
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002732 | Grad Max: 0.031145
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000202
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000560 | Grad Max: 0.002600
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000137 | Grad Max: 0.000677
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000222 | Grad Max: 0.000822
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001329 | Grad Max: 0.001329
[GRADIENT NORM TOTAL] 3.5470

[EPOCH SUMMARY] Train Loss: 0.8762

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8557 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8622 -> New: 0.8557)

############################## EPOCH 122/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.880
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044451  0.49555492] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.069
[MASKS] A(Pass/Fail): 707/1341 | B: 620/1428 | C: 476/1572
[LOSS Ex1] A: 0.63821 | B: 0.62835 | C: 0.62662
[LOGITS Ex2 A] Mean Abs: 2.093 | Max: 8.025
[LOSS Ex2] A: 0.14657 | B: 0.33962 | C: 0.25757
** [JOINT LOSS] ** : 0.878973
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003577 | Grad Max: 0.155775
  -> Layer: shared_layers.0.bias | Grad Mean: 0.091846 | Grad Max: 0.542717
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002352 | Grad Max: 0.006579
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005138 | Grad Max: 0.005138
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000815 | Grad Max: 0.168174
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012864 | Grad Max: 0.936435
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.003258
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002214 | Grad Max: 0.018235
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000141
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000452 | Grad Max: 0.002587
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000073
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000111 | Grad Max: 0.000712
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000201 | Grad Max: 0.000712
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001175 | Grad Max: 0.001175
[GRADIENT NORM TOTAL] 2.7660

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.730
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.502211   0.49778903] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.068
[MASKS] A(Pass/Fail): 677/1371 | B: 582/1274 | C: 504/1544
[LOSS Ex1] A: 0.64642 | B: 0.63260 | C: 0.62708
[LOGITS Ex2 A] Mean Abs: 2.054 | Max: 6.278
[LOSS Ex2] A: 0.12621 | B: 0.33034 | C: 0.23966
** [JOINT LOSS] ** : 0.867443
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004948 | Grad Max: 0.152426
  -> Layer: shared_layers.0.bias | Grad Mean: 0.190900 | Grad Max: 0.736228
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.006209
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009360 | Grad Max: 0.009360
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001386 | Grad Max: 0.159915
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024926 | Grad Max: 0.901235
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000220 | Grad Max: 0.008336
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014679 | Grad Max: 0.086289
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000349
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003057 | Grad Max: 0.006997
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000770 | Grad Max: 0.001890
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000678 | Grad Max: 0.001813
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014142 | Grad Max: 0.014142
[GRADIENT NORM TOTAL] 3.7501

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.652
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54346454 0.45653546] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.552 | Std: 0.066
[MASKS] A(Pass/Fail): 675/1373 | B: 628/1420 | C: 529/1519
[LOSS Ex1] A: 0.64648 | B: 0.63175 | C: 0.62398
[LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.143
[LOSS Ex2] A: 0.13277 | B: 0.35972 | C: 0.26620
** [JOINT LOSS] ** : 0.886970
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006414 | Grad Max: 0.234569
  -> Layer: shared_layers.0.bias | Grad Mean: 0.471798 | Grad Max: 2.751504
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.006000
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005606 | Grad Max: 0.005606
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003094 | Grad Max: 0.519786
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054949 | Grad Max: 2.914001
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.014151
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023932 | Grad Max: 0.134738
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000472
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004487 | Grad Max: 0.009997
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000193
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001117 | Grad Max: 0.002778
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000769 | Grad Max: 0.001631
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019069 | Grad Max: 0.019069
[GRADIENT NORM TOTAL] 10.4999

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.821
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.74784565 0.25215435] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.070
[MASKS] A(Pass/Fail): 734/1314 | B: 622/1426 | C: 481/1567
[LOSS Ex1] A: 0.64063 | B: 0.63244 | C: 0.62822
[LOGITS Ex2 A] Mean Abs: 2.144 | Max: 6.435
[LOSS Ex2] A: 0.12343 | B: 0.35946 | C: 0.26124
** [JOINT LOSS] ** : 0.881806
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006980 | Grad Max: 0.287712
  -> Layer: shared_layers.0.bias | Grad Mean: 0.652681 | Grad Max: 3.664814
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.005870
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002004 | Grad Max: 0.002004
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004197 | Grad Max: 0.758110
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.076140 | Grad Max: 4.233461
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000528 | Grad Max: 0.021259
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036708 | Grad Max: 0.217007
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000666
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007141 | Grad Max: 0.015364
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000273
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001818 | Grad Max: 0.004113
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001276 | Grad Max: 0.002622
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032126 | Grad Max: 0.032126
[GRADIENT NORM TOTAL] 14.7990

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.910
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50087845 0.49912155] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.069
[MASKS] A(Pass/Fail): 714/1334 | B: 620/1428 | C: 526/1522
[LOSS Ex1] A: 0.64716 | B: 0.62812 | C: 0.62432
[LOGITS Ex2 A] Mean Abs: 2.128 | Max: 6.510
[LOSS Ex2] A: 0.12999 | B: 0.33592 | C: 0.25602
** [JOINT LOSS] ** : 0.873841
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008748 | Grad Max: 0.345622
  -> Layer: shared_layers.0.bias | Grad Mean: 0.306400 | Grad Max: 1.591846
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.005844
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002576 | Grad Max: 0.002576
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002367 | Grad Max: 0.339146
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039468 | Grad Max: 1.822804
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.010036
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013881 | Grad Max: 0.106129
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000288
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002247 | Grad Max: 0.005549
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001540
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001138
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010209 | Grad Max: 0.010209
[GRADIENT NORM TOTAL] 6.8339

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.632
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.69510126 0.30489874] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.070
[MASKS] A(Pass/Fail): 699/1349 | B: 582/1274 | C: 465/1583
[LOSS Ex1] A: 0.64308 | B: 0.63239 | C: 0.63131
[LOGITS Ex2 A] Mean Abs: 2.068 | Max: 6.274
[LOSS Ex2] A: 0.15943 | B: 0.34324 | C: 0.28592
** [JOINT LOSS] ** : 0.898462
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012530 | Grad Max: 0.406141
  -> Layer: shared_layers.0.bias | Grad Mean: 0.409739 | Grad Max: 1.587740
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.006072
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002394 | Grad Max: 0.002394
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002847 | Grad Max: 0.372091
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050587 | Grad Max: 1.769511
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.009869
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024464 | Grad Max: 0.118294
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000519
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005391 | Grad Max: 0.010918
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000264
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001380 | Grad Max: 0.003400
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001216 | Grad Max: 0.002335
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025481 | Grad Max: 0.025481
[GRADIENT NORM TOTAL] 8.1052

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.734
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6195589 0.3804411] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.070
[MASKS] A(Pass/Fail): 587/1029 | B: 628/1420 | C: 499/1549
[LOSS Ex1] A: 0.64137 | B: 0.63156 | C: 0.62751
[LOGITS Ex2 A] Mean Abs: 2.100 | Max: 5.786
[LOSS Ex2] A: 0.13632 | B: 0.36087 | C: 0.23489
** [JOINT LOSS] ** : 0.877505
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009231 | Grad Max: 0.266429
  -> Layer: shared_layers.0.bias | Grad Mean: 0.569539 | Grad Max: 2.728072
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.005943
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007341 | Grad Max: 0.007341
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003701 | Grad Max: 0.757311
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067626 | Grad Max: 4.220363
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.016199
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033013 | Grad Max: 0.162735
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000695
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007010 | Grad Max: 0.015206
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000309
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001792 | Grad Max: 0.004012
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001463 | Grad Max: 0.002907
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032551 | Grad Max: 0.032551
[GRADIENT NORM TOTAL] 12.5894

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.912
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50735277 0.49264726] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.070
[MASKS] A(Pass/Fail): 710/1338 | B: 622/1426 | C: 486/1562
[LOSS Ex1] A: 0.64195 | B: 0.63226 | C: 0.62844
[LOGITS Ex2 A] Mean Abs: 2.102 | Max: 6.230
[LOSS Ex2] A: 0.12411 | B: 0.35712 | C: 0.25642
** [JOINT LOSS] ** : 0.880101
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003183 | Grad Max: 0.142045
  -> Layer: shared_layers.0.bias | Grad Mean: 0.194765 | Grad Max: 1.193328
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005724
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002833 | Grad Max: 0.002833
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001049 | Grad Max: 0.480596
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018508 | Grad Max: 2.684771
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003823
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004118 | Grad Max: 0.035289
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000162
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000875 | Grad Max: 0.003126
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000225 | Grad Max: 0.000765
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000286 | Grad Max: 0.000992
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005127 | Grad Max: 0.005127
[GRADIENT NORM TOTAL] 5.2780

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.860
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104603 0.4895397] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 701/1347 | B: 620/1428 | C: 545/1503
[LOSS Ex1] A: 0.63894 | B: 0.62794 | C: 0.62328
[LOGITS Ex2 A] Mean Abs: 2.118 | Max: 5.257
[LOSS Ex2] A: 0.13577 | B: 0.33392 | C: 0.25485
** [JOINT LOSS] ** : 0.871568
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009009 | Grad Max: 0.260619
  -> Layer: shared_layers.0.bias | Grad Mean: 0.442500 | Grad Max: 1.785108
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002365 | Grad Max: 0.006204
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003951 | Grad Max: 0.003951
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002966 | Grad Max: 0.359177
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054570 | Grad Max: 2.025843
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.012322
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028252 | Grad Max: 0.140751
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000509
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005953 | Grad Max: 0.012398
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000260
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001523 | Grad Max: 0.003578
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001252 | Grad Max: 0.002604
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027949 | Grad Max: 0.027949
[GRADIENT NORM TOTAL] 8.9807

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.886
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044989  0.49550113] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 707/1341 | B: 583/1273 | C: 517/1531
[LOSS Ex1] A: 0.63794 | B: 0.63223 | C: 0.62633
[LOGITS Ex2 A] Mean Abs: 2.115 | Max: 7.086
[LOSS Ex2] A: 0.16548 | B: 0.32726 | C: 0.25586
** [JOINT LOSS] ** : 0.881702
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012144 | Grad Max: 0.451277
  -> Layer: shared_layers.0.bias | Grad Mean: 0.427327 | Grad Max: 1.566839
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.006461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004446 | Grad Max: 0.004446
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003157 | Grad Max: 0.421112
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056340 | Grad Max: 2.318303
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000434 | Grad Max: 0.012662
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028823 | Grad Max: 0.145800
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000631
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006375 | Grad Max: 0.012688
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000282
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001657 | Grad Max: 0.003828
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001400 | Grad Max: 0.002724
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030502 | Grad Max: 0.030502
[GRADIENT NORM TOTAL] 8.8494

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.734
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021688  0.49783123] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.553 | Std: 0.069
[MASKS] A(Pass/Fail): 677/1371 | B: 628/1420 | C: 510/1538
[LOSS Ex1] A: 0.64618 | B: 0.63140 | C: 0.62644
[LOGITS Ex2 A] Mean Abs: 2.044 | Max: 5.863
[LOSS Ex2] A: 0.12408 | B: 0.36399 | C: 0.24446
** [JOINT LOSS] ** : 0.878850
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005145 | Grad Max: 0.147396
  -> Layer: shared_layers.0.bias | Grad Mean: 0.316003 | Grad Max: 1.451213
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006787
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011833 | Grad Max: 0.011833
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.354299
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036451 | Grad Max: 1.817282
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.010677
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016447 | Grad Max: 0.101143
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000305
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003111 | Grad Max: 0.007228
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000152
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000781 | Grad Max: 0.002156
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000576 | Grad Max: 0.001474
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014082 | Grad Max: 0.014082
[GRADIENT NORM TOTAL] 6.7071

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.656
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54345006 0.45654994] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.067
[MASKS] A(Pass/Fail): 676/1372 | B: 622/1426 | C: 506/1542
[LOSS Ex1] A: 0.64627 | B: 0.63210 | C: 0.62381
[LOGITS Ex2 A] Mean Abs: 2.015 | Max: 5.694
[LOSS Ex2] A: 0.12089 | B: 0.35878 | C: 0.24198
** [JOINT LOSS] ** : 0.874610
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003822 | Grad Max: 0.116021
  -> Layer: shared_layers.0.bias | Grad Mean: 0.355347 | Grad Max: 1.603620
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006161
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007425 | Grad Max: 0.007425
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002307 | Grad Max: 0.297535
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041942 | Grad Max: 1.610927
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.013593
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021396 | Grad Max: 0.142835
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000428
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004153 | Grad Max: 0.009685
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000195
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001053 | Grad Max: 0.002793
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000778 | Grad Max: 0.001969
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019200 | Grad Max: 0.019200
[GRADIENT NORM TOTAL] 7.4537

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.826
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.74913806 0.2508619 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.070
[MASKS] A(Pass/Fail): 734/1314 | B: 620/1428 | C: 490/1558
[LOSS Ex1] A: 0.64040 | B: 0.62779 | C: 0.62480
[LOGITS Ex2 A] Mean Abs: 2.056 | Max: 6.427
[LOSS Ex2] A: 0.13189 | B: 0.32827 | C: 0.22165
** [JOINT LOSS] ** : 0.858267
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004995 | Grad Max: 0.177715
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141133 | Grad Max: 0.620464
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.006497
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003084 | Grad Max: 0.003084
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001021 | Grad Max: 0.233575
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017479 | Grad Max: 1.293205
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000101 | Grad Max: 0.004072
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006277 | Grad Max: 0.032463
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000271
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001482 | Grad Max: 0.003838
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000389 | Grad Max: 0.001288
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.001635
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007227 | Grad Max: 0.007227
[GRADIENT NORM TOTAL] 3.5032

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.916
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50088435 0.49911568] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.069
[MASKS] A(Pass/Fail): 714/1334 | B: 585/1271 | C: 337/1039
[LOSS Ex1] A: 0.64695 | B: 0.63208 | C: 0.62610
[LOGITS Ex2 A] Mean Abs: 2.110 | Max: 5.622
[LOSS Ex2] A: 0.11535 | B: 0.33046 | C: 0.24643
** [JOINT LOSS] ** : 0.865785
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.064773
  -> Layer: shared_layers.0.bias | Grad Mean: 0.082621 | Grad Max: 0.362353
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.005954
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002376 | Grad Max: 0.002376
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000757 | Grad Max: 0.177102
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012734 | Grad Max: 0.985454
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.004027
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002769 | Grad Max: 0.039263
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000129
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000425 | Grad Max: 0.002458
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.000633
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000396 | Grad Max: 0.001002
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001031 | Grad Max: 0.001031
[GRADIENT NORM TOTAL] 2.6441

[EPOCH SUMMARY] Train Loss: 0.8768

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8560 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 123/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.636
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6959383 0.3040617] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.070
[MASKS] A(Pass/Fail): 699/1349 | B: 630/1418 | C: 465/1583
[LOSS Ex1] A: 0.64286 | B: 0.63124 | C: 0.63426
[LOGITS Ex2 A] Mean Abs: 2.066 | Max: 5.971
[LOSS Ex2] A: 0.13945 | B: 0.34966 | C: 0.25886
** [JOINT LOSS] ** : 0.885447
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005402 | Grad Max: 0.181821
  -> Layer: shared_layers.0.bias | Grad Mean: 0.133374 | Grad Max: 0.546803
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.005768
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003018 | Grad Max: 0.003018
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001040 | Grad Max: 0.276625
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017308 | Grad Max: 1.557226
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.003898
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004622 | Grad Max: 0.028213
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000234
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001142 | Grad Max: 0.003711
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000280 | Grad Max: 0.000875
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000299 | Grad Max: 0.001030
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004670 | Grad Max: 0.004670
[GRADIENT NORM TOTAL] 3.5881

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.738
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.61995983 0.38004014] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.070
[MASKS] A(Pass/Fail): 587/1029 | B: 623/1425 | C: 494/1554
[LOSS Ex1] A: 0.64114 | B: 0.63194 | C: 0.62995
[LOGITS Ex2 A] Mean Abs: 2.105 | Max: 7.198
[LOSS Ex2] A: 0.13473 | B: 0.35121 | C: 0.27584
** [JOINT LOSS] ** : 0.888274
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004082 | Grad Max: 0.156711
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141589 | Grad Max: 0.676188
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.005873
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007306 | Grad Max: 0.007306
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001135 | Grad Max: 0.175157
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019213 | Grad Max: 0.921149
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000071 | Grad Max: 0.004276
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003465 | Grad Max: 0.040708
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000135
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000489 | Grad Max: 0.002604
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000060
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000114 | Grad Max: 0.000660
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000175 | Grad Max: 0.000758
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000570 | Grad Max: 0.000570
[GRADIENT NORM TOTAL] 3.4806

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.917
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50732154 0.49267846] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 710/1338 | B: 620/1428 | C: 511/1537
[LOSS Ex1] A: 0.64172 | B: 0.62762 | C: 0.62433
[LOGITS Ex2 A] Mean Abs: 2.105 | Max: 7.395
[LOSS Ex2] A: 0.12090 | B: 0.33169 | C: 0.25955
** [JOINT LOSS] ** : 0.868601
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.069470
  -> Layer: shared_layers.0.bias | Grad Mean: 0.088703 | Grad Max: 0.449160
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002273 | Grad Max: 0.005729
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003009 | Grad Max: 0.003009
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000736 | Grad Max: 0.175656
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012853 | Grad Max: 0.980993
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.003241
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002506 | Grad Max: 0.020880
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000129
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000387 | Grad Max: 0.002705
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000054
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000099 | Grad Max: 0.000610
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000334 | Grad Max: 0.000917
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000648 | Grad Max: 0.000648
[GRADIENT NORM TOTAL] 2.6399

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.864
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104711  0.48952892] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 701/1347 | B: 585/1271 | C: 487/1561
[LOSS Ex1] A: 0.63870 | B: 0.63190 | C: 0.62747
[LOGITS Ex2 A] Mean Abs: 2.082 | Max: 5.476
[LOSS Ex2] A: 0.12882 | B: 0.33535 | C: 0.23720
** [JOINT LOSS] ** : 0.866478
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003427 | Grad Max: 0.107804
  -> Layer: shared_layers.0.bias | Grad Mean: 0.291572 | Grad Max: 1.520487
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006023
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000911 | Grad Max: 0.000911
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001770 | Grad Max: 0.447997
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031559 | Grad Max: 2.488886
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.008757
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013626 | Grad Max: 0.097625
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000308
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002666 | Grad Max: 0.006349
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000140
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000677 | Grad Max: 0.001764
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000480 | Grad Max: 0.001560
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011816 | Grad Max: 0.011816
[GRADIENT NORM TOTAL] 6.4368

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.892
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045406  0.49545938] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 707/1341 | B: 630/1418 | C: 530/1518
[LOSS Ex1] A: 0.63769 | B: 0.63107 | C: 0.62439
[LOGITS Ex2 A] Mean Abs: 2.064 | Max: 7.341
[LOSS Ex2] A: 0.14402 | B: 0.35644 | C: 0.24531
** [JOINT LOSS] ** : 0.879644
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005917 | Grad Max: 0.265172
  -> Layer: shared_layers.0.bias | Grad Mean: 0.104220 | Grad Max: 0.385255
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.006668
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001808 | Grad Max: 0.001808
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001016 | Grad Max: 0.165561
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015378 | Grad Max: 0.913546
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000083 | Grad Max: 0.004294
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004067 | Grad Max: 0.027065
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000214
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001040 | Grad Max: 0.003549
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000265 | Grad Max: 0.000944
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000253 | Grad Max: 0.001050
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004307 | Grad Max: 0.004307
[GRADIENT NORM TOTAL] 2.8222

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.738
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021162 0.4978838] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.069
[MASKS] A(Pass/Fail): 677/1371 | B: 623/1425 | C: 527/1521
[LOSS Ex1] A: 0.64595 | B: 0.63176 | C: 0.62183
[LOGITS Ex2 A] Mean Abs: 2.074 | Max: 5.849
[LOSS Ex2] A: 0.12167 | B: 0.35133 | C: 0.22469
** [JOINT LOSS] ** : 0.865746
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002758 | Grad Max: 0.078278
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125310 | Grad Max: 0.518688
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002142 | Grad Max: 0.005922
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008006 | Grad Max: 0.008006
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000904 | Grad Max: 0.303575
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015348 | Grad Max: 1.713656
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.006260
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004381 | Grad Max: 0.048521
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000216
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000714 | Grad Max: 0.002718
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000077
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000174 | Grad Max: 0.000983
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000427 | Grad Max: 0.001116
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002344 | Grad Max: 0.002344
[GRADIENT NORM TOTAL] 3.4490

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.661
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54355335 0.45644668] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.067
[MASKS] A(Pass/Fail): 677/1371 | B: 620/1428 | C: 526/1522
[LOSS Ex1] A: 0.64604 | B: 0.62743 | C: 0.62529
[LOGITS Ex2 A] Mean Abs: 2.064 | Max: 6.010
[LOSS Ex2] A: 0.13017 | B: 0.33586 | C: 0.24973
** [JOINT LOSS] ** : 0.871506
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003578 | Grad Max: 0.145065
  -> Layer: shared_layers.0.bias | Grad Mean: 0.303104 | Grad Max: 1.743588
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.005428
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003586 | Grad Max: 0.003586
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001956 | Grad Max: 0.319754
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035396 | Grad Max: 1.742353
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.008976
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016008 | Grad Max: 0.106424
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000274
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003176 | Grad Max: 0.007117
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000809 | Grad Max: 0.001929
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000550 | Grad Max: 0.001705
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013962 | Grad Max: 0.013962
[GRADIENT NORM TOTAL] 6.6944

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.831
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7505793  0.24942073] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.071
[MASKS] A(Pass/Fail): 734/1314 | B: 586/1270 | C: 521/1527
[LOSS Ex1] A: 0.64014 | B: 0.63172 | C: 0.62214
[LOGITS Ex2 A] Mean Abs: 2.095 | Max: 6.538
[LOSS Ex2] A: 0.11337 | B: 0.33081 | C: 0.24392
** [JOINT LOSS] ** : 0.860701
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003211 | Grad Max: 0.104264
  -> Layer: shared_layers.0.bias | Grad Mean: 0.090113 | Grad Max: 0.403602
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002357 | Grad Max: 0.006763
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010784 | Grad Max: 0.010784
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000759 | Grad Max: 0.184355
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012700 | Grad Max: 1.010200
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.002847
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002693 | Grad Max: 0.022632
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000161
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000569 | Grad Max: 0.002811
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000072
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000134 | Grad Max: 0.000853
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000381 | Grad Max: 0.001100
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001337 | Grad Max: 0.001337
[GRADIENT NORM TOTAL] 2.5630

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.921
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50091475 0.49908522] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.069
[MASKS] A(Pass/Fail): 715/1333 | B: 631/1417 | C: 500/1548
[LOSS Ex1] A: 0.64670 | B: 0.63088 | C: 0.62679
[LOGITS Ex2 A] Mean Abs: 2.110 | Max: 6.188
[LOSS Ex2] A: 0.11217 | B: 0.35604 | C: 0.26026
** [JOINT LOSS] ** : 0.877617
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005743 | Grad Max: 0.180796
  -> Layer: shared_layers.0.bias | Grad Mean: 0.245344 | Grad Max: 1.093278
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.006043
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003184 | Grad Max: 0.003184
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001605 | Grad Max: 0.441398
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028263 | Grad Max: 2.489155
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.007240
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012888 | Grad Max: 0.072674
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000337
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002812 | Grad Max: 0.006393
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000146
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000722 | Grad Max: 0.001663
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000588 | Grad Max: 0.001490
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013246 | Grad Max: 0.013246
[GRADIENT NORM TOTAL] 5.5423

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.641
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.696925 0.303075] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.070
[MASKS] A(Pass/Fail): 702/1346 | B: 624/1424 | C: 493/1555
[LOSS Ex1] A: 0.64261 | B: 0.63157 | C: 0.62542
[LOGITS Ex2 A] Mean Abs: 2.111 | Max: 5.845
[LOSS Ex2] A: 0.13268 | B: 0.34457 | C: 0.25610
** [JOINT LOSS] ** : 0.877644
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003445 | Grad Max: 0.092421
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176393 | Grad Max: 0.861026
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.006297
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010013 | Grad Max: 0.010013
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001328 | Grad Max: 0.309706
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023457 | Grad Max: 1.728211
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000114 | Grad Max: 0.007119
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007781 | Grad Max: 0.073615
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000192
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001507 | Grad Max: 0.004217
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000093
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000379 | Grad Max: 0.001108
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000290 | Grad Max: 0.000998
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006003 | Grad Max: 0.006003
[GRADIENT NORM TOTAL] 4.7047

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.744
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62046486 0.3795351 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 586/1030 | B: 620/1428 | C: 531/1517
[LOSS Ex1] A: 0.64089 | B: 0.62723 | C: 0.62368
[LOGITS Ex2 A] Mean Abs: 2.155 | Max: 7.466
[LOSS Ex2] A: 0.12348 | B: 0.33666 | C: 0.25793
** [JOINT LOSS] ** : 0.869956
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002449 | Grad Max: 0.072513
  -> Layer: shared_layers.0.bias | Grad Mean: 0.171752 | Grad Max: 0.952079
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002240 | Grad Max: 0.006442
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008567 | Grad Max: 0.008567
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001339 | Grad Max: 0.276032
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024022 | Grad Max: 1.543805
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.005688
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010409 | Grad Max: 0.056943
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000216
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002077 | Grad Max: 0.005304
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000116
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000528 | Grad Max: 0.001423
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000367 | Grad Max: 0.001248
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008775 | Grad Max: 0.008775
[GRADIENT NORM TOTAL] 4.4295

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.923
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072711  0.49272895] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 710/1338 | B: 586/1270 | C: 499/1549
[LOSS Ex1] A: 0.64146 | B: 0.63152 | C: 0.62438
[LOGITS Ex2 A] Mean Abs: 2.114 | Max: 6.614
[LOSS Ex2] A: 0.12049 | B: 0.34083 | C: 0.23087
** [JOINT LOSS] ** : 0.863184
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003243 | Grad Max: 0.109716
  -> Layer: shared_layers.0.bias | Grad Mean: 0.255179 | Grad Max: 1.173665
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.006087
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002790 | Grad Max: 0.002790
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001601 | Grad Max: 0.188105
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029233 | Grad Max: 1.050740
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.010227
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016470 | Grad Max: 0.099716
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000332
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003263 | Grad Max: 0.007748
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000808 | Grad Max: 0.002340
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000598 | Grad Max: 0.001701
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013893 | Grad Max: 0.013893
[GRADIENT NORM TOTAL] 5.1480

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.870
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51043415 0.48956582] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.071
[MASKS] A(Pass/Fail): 704/1344 | B: 632/1416 | C: 501/1547
[LOSS Ex1] A: 0.63843 | B: 0.63068 | C: 0.62776
[LOGITS Ex2 A] Mean Abs: 2.146 | Max: 6.100
[LOSS Ex2] A: 0.12674 | B: 0.34839 | C: 0.24973
** [JOINT LOSS] ** : 0.873912
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003706 | Grad Max: 0.110021
  -> Layer: shared_layers.0.bias | Grad Mean: 0.110637 | Grad Max: 0.509463
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.006322
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003786 | Grad Max: 0.003786
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000951 | Grad Max: 0.126364
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015888 | Grad Max: 0.700898
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003651
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003382 | Grad Max: 0.035783
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000128
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000480 | Grad Max: 0.002120
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000055
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000137 | Grad Max: 0.000644
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000224 | Grad Max: 0.000754
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002901 | Grad Max: 0.002901
[GRADIENT NORM TOTAL] 2.7621

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.898
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045722  0.49542782] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 708/1340 | B: 624/1424 | C: 358/1018
[LOSS Ex1] A: 0.63743 | B: 0.63137 | C: 0.62064
[LOGITS Ex2 A] Mean Abs: 2.119 | Max: 7.051
[LOSS Ex2] A: 0.15301 | B: 0.35151 | C: 0.25354
** [JOINT LOSS] ** : 0.882497
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007757 | Grad Max: 0.294427
  -> Layer: shared_layers.0.bias | Grad Mean: 0.372485 | Grad Max: 1.545357
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.006461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002594 | Grad Max: 0.002594
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002587 | Grad Max: 0.423823
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046712 | Grad Max: 2.370435
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000316 | Grad Max: 0.010069
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021422 | Grad Max: 0.101515
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000500
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004616 | Grad Max: 0.010002
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000200
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001157 | Grad Max: 0.002696
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000863 | Grad Max: 0.002045
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019407 | Grad Max: 0.019407
[GRADIENT NORM TOTAL] 8.2167

[EPOCH SUMMARY] Train Loss: 0.8737

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8533 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8557 -> New: 0.8533)

############################## EPOCH 124/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.743
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020353  0.49796462] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.069
[MASKS] A(Pass/Fail): 677/1371 | B: 620/1428 | C: 521/1527
[LOSS Ex1] A: 0.64571 | B: 0.62703 | C: 0.62260
[LOGITS Ex2 A] Mean Abs: 2.109 | Max: 5.761
[LOSS Ex2] A: 0.11896 | B: 0.33304 | C: 0.24897
** [JOINT LOSS] ** : 0.865433
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003213 | Grad Max: 0.095384
  -> Layer: shared_layers.0.bias | Grad Mean: 0.197964 | Grad Max: 1.129166
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002258 | Grad Max: 0.006498
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009930 | Grad Max: 0.009930
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001268 | Grad Max: 0.269037
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023171 | Grad Max: 1.505456
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000131 | Grad Max: 0.007032
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009145 | Grad Max: 0.060640
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000238
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001919 | Grad Max: 0.004792
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000478 | Grad Max: 0.001268
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000369 | Grad Max: 0.001255
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007796 | Grad Max: 0.007796
[GRADIENT NORM TOTAL] 4.6896

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.666
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5436094 0.4563906] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.067
[MASKS] A(Pass/Fail): 677/1371 | B: 586/1270 | C: 503/1545
[LOSS Ex1] A: 0.64581 | B: 0.63132 | C: 0.62480
[LOGITS Ex2 A] Mean Abs: 2.048 | Max: 6.740
[LOSS Ex2] A: 0.13380 | B: 0.33615 | C: 0.22927
** [JOINT LOSS] ** : 0.867049
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005318 | Grad Max: 0.143759
  -> Layer: shared_layers.0.bias | Grad Mean: 0.328283 | Grad Max: 1.495407
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005987
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003032 | Grad Max: 0.003032
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.413912
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040608 | Grad Max: 2.305137
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.009208
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020264 | Grad Max: 0.097102
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000379
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004194 | Grad Max: 0.008922
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000201
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001053 | Grad Max: 0.002596
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000793 | Grad Max: 0.002162
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018195 | Grad Max: 0.018195
[GRADIENT NORM TOTAL] 7.2169

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.835
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75210744 0.24789259] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.071
[MASKS] A(Pass/Fail): 739/1309 | B: 632/1416 | C: 525/1523
[LOSS Ex1] A: 0.63989 | B: 0.63049 | C: 0.62023
[LOGITS Ex2 A] Mean Abs: 2.090 | Max: 6.441
[LOSS Ex2] A: 0.11883 | B: 0.35881 | C: 0.23410
** [JOINT LOSS] ** : 0.867450
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003261 | Grad Max: 0.103649
  -> Layer: shared_layers.0.bias | Grad Mean: 0.260358 | Grad Max: 1.560939
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.006524
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002123 | Grad Max: 0.002123
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001783 | Grad Max: 0.417564
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032486 | Grad Max: 2.346198
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000192 | Grad Max: 0.007225
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013451 | Grad Max: 0.069990
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000274
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002803 | Grad Max: 0.006733
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000149
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000698 | Grad Max: 0.001979
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000529 | Grad Max: 0.001484
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011993 | Grad Max: 0.011993
[GRADIENT NORM TOTAL] 6.4461

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.926
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500956   0.49904406] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 715/1333 | B: 624/1424 | C: 503/1545
[LOSS Ex1] A: 0.64647 | B: 0.63118 | C: 0.63045
[LOGITS Ex2 A] Mean Abs: 2.124 | Max: 5.785
[LOSS Ex2] A: 0.11241 | B: 0.34847 | C: 0.25057
** [JOINT LOSS] ** : 0.873179
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003798 | Grad Max: 0.122908
  -> Layer: shared_layers.0.bias | Grad Mean: 0.213063 | Grad Max: 0.898049
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005835
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000682 | Grad Max: 0.000682
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001671 | Grad Max: 0.198822
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029405 | Grad Max: 1.089567
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000186 | Grad Max: 0.011130
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012720 | Grad Max: 0.108514
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000234
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002343 | Grad Max: 0.005577
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000128
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000589 | Grad Max: 0.001702
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001301
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009635 | Grad Max: 0.009635
[GRADIENT NORM TOTAL] 4.9596

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.646
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6978943 0.3021057] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.071
[MASKS] A(Pass/Fail): 705/1343 | B: 620/1428 | C: 568/1480
[LOSS Ex1] A: 0.64236 | B: 0.62684 | C: 0.62144
[LOGITS Ex2 A] Mean Abs: 2.117 | Max: 6.897
[LOSS Ex2] A: 0.13565 | B: 0.32806 | C: 0.25786
** [JOINT LOSS] ** : 0.870735
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004327 | Grad Max: 0.149064
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239451 | Grad Max: 1.395006
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.005899
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003830 | Grad Max: 0.003830
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001667 | Grad Max: 0.285795
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029019 | Grad Max: 1.593446
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.009446
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011352 | Grad Max: 0.079326
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000232
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001998 | Grad Max: 0.005125
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000095
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000499 | Grad Max: 0.001289
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000316 | Grad Max: 0.001254
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008492 | Grad Max: 0.008492
[GRADIENT NORM TOTAL] 5.3317

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.749
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6209914  0.37900853] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 587/1029 | B: 586/1270 | C: 488/1560
[LOSS Ex1] A: 0.64064 | B: 0.63113 | C: 0.62888
[LOGITS Ex2 A] Mean Abs: 2.129 | Max: 8.132
[LOSS Ex2] A: 0.12289 | B: 0.33209 | C: 0.26186
** [JOINT LOSS] ** : 0.872498
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004748 | Grad Max: 0.140141
  -> Layer: shared_layers.0.bias | Grad Mean: 0.278821 | Grad Max: 1.013433
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.005547
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004033 | Grad Max: 0.004033
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001795 | Grad Max: 0.237487
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032947 | Grad Max: 1.254378
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.010154
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016402 | Grad Max: 0.113966
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000328
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003503 | Grad Max: 0.007233
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000177
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000916 | Grad Max: 0.002353
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000792 | Grad Max: 0.001767
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017640 | Grad Max: 0.017640
[GRADIENT NORM TOTAL] 5.6574

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.928
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50728065 0.49271935] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 712/1336 | B: 633/1415 | C: 501/1547
[LOSS Ex1] A: 0.64121 | B: 0.63031 | C: 0.62798
[LOGITS Ex2 A] Mean Abs: 2.140 | Max: 7.668
[LOSS Ex2] A: 0.10833 | B: 0.35221 | C: 0.24264
** [JOINT LOSS] ** : 0.867563
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003413 | Grad Max: 0.114459
  -> Layer: shared_layers.0.bias | Grad Mean: 0.218998 | Grad Max: 1.019021
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006198
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001432 | Grad Max: 0.001432
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001407 | Grad Max: 0.552259
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024828 | Grad Max: 3.082441
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.006743
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009451 | Grad Max: 0.061944
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001783 | Grad Max: 0.004864
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000079
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000472 | Grad Max: 0.001096
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001320
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009370 | Grad Max: 0.009370
[GRADIENT NORM TOTAL] 5.7347

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.876
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104191 0.4895809] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 705/1343 | B: 625/1423 | C: 521/1527
[LOSS Ex1] A: 0.63817 | B: 0.63100 | C: 0.62559
[LOGITS Ex2 A] Mean Abs: 2.145 | Max: 5.701
[LOSS Ex2] A: 0.13931 | B: 0.34621 | C: 0.26974
** [JOINT LOSS] ** : 0.883342
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007947 | Grad Max: 0.210566
  -> Layer: shared_layers.0.bias | Grad Mean: 0.335731 | Grad Max: 1.455301
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006512
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001296 | Grad Max: 0.001296
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002338 | Grad Max: 0.353433
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042201 | Grad Max: 1.973873
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000311 | Grad Max: 0.010175
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021137 | Grad Max: 0.112899
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000451
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004485 | Grad Max: 0.009621
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000210
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001113 | Grad Max: 0.002811
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000873 | Grad Max: 0.001768
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018891 | Grad Max: 0.018891
[GRADIENT NORM TOTAL] 7.2422

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.903
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046031 0.4953969] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 711/1337 | B: 622/1426 | C: 505/1543
[LOSS Ex1] A: 0.63717 | B: 0.62667 | C: 0.62526
[LOGITS Ex2 A] Mean Abs: 2.107 | Max: 8.059
[LOSS Ex2] A: 0.14490 | B: 0.32839 | C: 0.24335
** [JOINT LOSS] ** : 0.868582
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005544 | Grad Max: 0.213998
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208179 | Grad Max: 0.752692
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002330 | Grad Max: 0.006387
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007138 | Grad Max: 0.007138
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001586 | Grad Max: 0.298433
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027542 | Grad Max: 1.636307
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006502
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012560 | Grad Max: 0.067392
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000340
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002795 | Grad Max: 0.007123
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000713 | Grad Max: 0.001794
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000540 | Grad Max: 0.001699
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012604 | Grad Max: 0.012604
[GRADIENT NORM TOTAL] 4.8401

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.748
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50198615 0.49801382] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.070
[MASKS] A(Pass/Fail): 677/1371 | B: 586/1270 | C: 535/1513
[LOSS Ex1] A: 0.64548 | B: 0.63097 | C: 0.62326
[LOGITS Ex2 A] Mean Abs: 2.043 | Max: 6.303
[LOSS Ex2] A: 0.11916 | B: 0.34850 | C: 0.25533
** [JOINT LOSS] ** : 0.874235
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003006 | Grad Max: 0.133608
  -> Layer: shared_layers.0.bias | Grad Mean: 0.389931 | Grad Max: 1.798444
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005575
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003379 | Grad Max: 0.003379
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002571 | Grad Max: 0.446711
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047838 | Grad Max: 2.524558
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000348 | Grad Max: 0.015514
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024998 | Grad Max: 0.165887
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000456
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005128 | Grad Max: 0.010821
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001322 | Grad Max: 0.003263
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000993 | Grad Max: 0.002162
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023876 | Grad Max: 0.023876
[GRADIENT NORM TOTAL] 8.7055

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.670
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54370445 0.45629558] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.068
[MASKS] A(Pass/Fail): 680/1368 | B: 633/1415 | C: 490/1558
[LOSS Ex1] A: 0.64558 | B: 0.63016 | C: 0.62763
[LOGITS Ex2 A] Mean Abs: 2.024 | Max: 6.150
[LOSS Ex2] A: 0.12314 | B: 0.36452 | C: 0.25796
** [JOINT LOSS] ** : 0.882997
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006419 | Grad Max: 0.154096
  -> Layer: shared_layers.0.bias | Grad Mean: 0.460939 | Grad Max: 2.082536
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.006270
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010657 | Grad Max: 0.010657
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003004 | Grad Max: 0.538694
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055545 | Grad Max: 3.043663
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.012139
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027844 | Grad Max: 0.153649
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000515
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005868 | Grad Max: 0.011519
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000265
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001520 | Grad Max: 0.003608
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001187 | Grad Max: 0.002372
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028199 | Grad Max: 0.028199
[GRADIENT NORM TOTAL] 10.3885

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.841
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75345695 0.24654305] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.071
[MASKS] A(Pass/Fail): 740/1308 | B: 625/1423 | C: 497/1551
[LOSS Ex1] A: 0.63965 | B: 0.63085 | C: 0.62491
[LOGITS Ex2 A] Mean Abs: 2.110 | Max: 6.069
[LOSS Ex2] A: 0.11737 | B: 0.34326 | C: 0.25489
** [JOINT LOSS] ** : 0.870312
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.075573
  -> Layer: shared_layers.0.bias | Grad Mean: 0.179245 | Grad Max: 0.718249
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006431
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004931 | Grad Max: 0.004931
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001319 | Grad Max: 0.334750
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024082 | Grad Max: 1.865856
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000133 | Grad Max: 0.006618
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009524 | Grad Max: 0.052220
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000205
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001870 | Grad Max: 0.005010
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000126
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000462 | Grad Max: 0.001436
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000356 | Grad Max: 0.001204
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007334 | Grad Max: 0.007334
[GRADIENT NORM TOTAL] 4.9534

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.932
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50091046 0.4990895 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.070
[MASKS] A(Pass/Fail): 715/1333 | B: 622/1426 | C: 497/1551
[LOSS Ex1] A: 0.64625 | B: 0.62651 | C: 0.62867
[LOGITS Ex2 A] Mean Abs: 2.120 | Max: 5.736
[LOSS Ex2] A: 0.11387 | B: 0.33017 | C: 0.25859
** [JOINT LOSS] ** : 0.868021
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003536 | Grad Max: 0.159679
  -> Layer: shared_layers.0.bias | Grad Mean: 0.270588 | Grad Max: 1.491322
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005975
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005942 | Grad Max: 0.005942
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001908 | Grad Max: 0.382657
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034068 | Grad Max: 2.122998
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.008332
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013831 | Grad Max: 0.086482
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000245
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002605 | Grad Max: 0.005962
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000122
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000683 | Grad Max: 0.001804
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000441 | Grad Max: 0.001738
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011356 | Grad Max: 0.011356
[GRADIENT NORM TOTAL] 6.5544

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.650
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.69879067 0.30120933] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.071
[MASKS] A(Pass/Fail): 706/1342 | B: 586/1270 | C: 372/1004
[LOSS Ex1] A: 0.64212 | B: 0.63082 | C: 0.61676
[LOGITS Ex2 A] Mean Abs: 2.087 | Max: 6.312
[LOSS Ex2] A: 0.13423 | B: 0.32664 | C: 0.23054
** [JOINT LOSS] ** : 0.860372
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004170 | Grad Max: 0.131603
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143289 | Grad Max: 0.573071
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002357 | Grad Max: 0.006575
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007931 | Grad Max: 0.007931
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001093 | Grad Max: 0.167660
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018509 | Grad Max: 0.951221
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.005254
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007249 | Grad Max: 0.040660
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000243
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001575 | Grad Max: 0.005048
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000340 | Grad Max: 0.001205
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000382 | Grad Max: 0.001106
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003981 | Grad Max: 0.003981
[GRADIENT NORM TOTAL] 3.0544

[EPOCH SUMMARY] Train Loss: 0.8708

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8516 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8533 -> New: 0.8516)

############################## EPOCH 125/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.753
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62147266 0.37852737] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.071
[MASKS] A(Pass/Fail): 587/1029 | B: 633/1415 | C: 486/1562
[LOSS Ex1] A: 0.64039 | B: 0.63001 | C: 0.62838
[LOGITS Ex2 A] Mean Abs: 2.127 | Max: 7.139
[LOSS Ex2] A: 0.11752 | B: 0.35266 | C: 0.25929
** [JOINT LOSS] ** : 0.876082
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002465 | Grad Max: 0.067582
  -> Layer: shared_layers.0.bias | Grad Mean: 0.074624 | Grad Max: 0.351940
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006256
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002766 | Grad Max: 0.002766
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000784 | Grad Max: 0.207510
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013454 | Grad Max: 1.169100
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003632
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003481 | Grad Max: 0.031046
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000159
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000714 | Grad Max: 0.003120
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000072
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000184 | Grad Max: 0.000862
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000260 | Grad Max: 0.000885
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004068 | Grad Max: 0.004068
[GRADIENT NORM TOTAL] 2.6899

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.934
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072824 0.4927176] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.072
[MASKS] A(Pass/Fail): 712/1336 | B: 625/1423 | C: 519/1529
[LOSS Ex1] A: 0.64098 | B: 0.63069 | C: 0.62820
[LOGITS Ex2 A] Mean Abs: 2.128 | Max: 8.760
[LOSS Ex2] A: 0.11569 | B: 0.34948 | C: 0.25878
** [JOINT LOSS] ** : 0.874604
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004232 | Grad Max: 0.112456
  -> Layer: shared_layers.0.bias | Grad Mean: 0.343642 | Grad Max: 1.252653
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006144
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006066 | Grad Max: 0.006066
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002402 | Grad Max: 0.353519
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044014 | Grad Max: 1.971685
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.010474
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021725 | Grad Max: 0.114854
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000390
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004493 | Grad Max: 0.009610
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000192
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001125 | Grad Max: 0.002641
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000760 | Grad Max: 0.001944
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018504 | Grad Max: 0.018504
[GRADIENT NORM TOTAL] 7.6088

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.881
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104458  0.48955417] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 706/1342 | B: 623/1425 | C: 530/1518
[LOSS Ex1] A: 0.63793 | B: 0.62634 | C: 0.61969
[LOGITS Ex2 A] Mean Abs: 2.109 | Max: 6.471
[LOSS Ex2] A: 0.12510 | B: 0.32997 | C: 0.24404
** [JOINT LOSS] ** : 0.861020
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004264 | Grad Max: 0.130782
  -> Layer: shared_layers.0.bias | Grad Mean: 0.190611 | Grad Max: 0.874468
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002397 | Grad Max: 0.006037
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000827 | Grad Max: 0.000827
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.402113
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025690 | Grad Max: 2.249467
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.005464
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010283 | Grad Max: 0.054728
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000243
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002261 | Grad Max: 0.006238
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000121
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.001345
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000415 | Grad Max: 0.001383
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010061 | Grad Max: 0.010061
[GRADIENT NORM TOTAL] 5.0143

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.908
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046133  0.49538678] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 712/1336 | B: 586/1270 | C: 520/1528
[LOSS Ex1] A: 0.63693 | B: 0.63064 | C: 0.62270
[LOGITS Ex2 A] Mean Abs: 2.051 | Max: 7.480
[LOSS Ex2] A: 0.14300 | B: 0.34102 | C: 0.25995
** [JOINT LOSS] ** : 0.878078
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005625 | Grad Max: 0.205481
  -> Layer: shared_layers.0.bias | Grad Mean: 0.370863 | Grad Max: 1.484509
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002287 | Grad Max: 0.006186
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001567 | Grad Max: 0.001567
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.267285
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041630 | Grad Max: 1.489181
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000309 | Grad Max: 0.014898
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021253 | Grad Max: 0.153727
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000414
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004003 | Grad Max: 0.008845
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000166
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001045 | Grad Max: 0.002311
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000804 | Grad Max: 0.001786
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019825 | Grad Max: 0.019825
[GRADIENT NORM TOTAL] 7.3174

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.752
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50188524 0.49811473] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.070
[MASKS] A(Pass/Fail): 679/1369 | B: 633/1415 | C: 519/1529
[LOSS Ex1] A: 0.64527 | B: 0.62983 | C: 0.62402
[LOGITS Ex2 A] Mean Abs: 2.029 | Max: 6.684
[LOSS Ex2] A: 0.12482 | B: 0.35018 | C: 0.23992
** [JOINT LOSS] ** : 0.871348
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004044 | Grad Max: 0.106032
  -> Layer: shared_layers.0.bias | Grad Mean: 0.339179 | Grad Max: 1.338906
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005589
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003366 | Grad Max: 0.003366
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.252346
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041083 | Grad Max: 1.415466
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.012724
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020288 | Grad Max: 0.132517
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000366
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004010 | Grad Max: 0.008714
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000201
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001042 | Grad Max: 0.002658
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000832 | Grad Max: 0.002106
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019279 | Grad Max: 0.019279
[GRADIENT NORM TOTAL] 6.9014

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.675
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54379785 0.45620212] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.553 | Std: 0.068
[MASKS] A(Pass/Fail): 680/1368 | B: 625/1423 | C: 510/1538
[LOSS Ex1] A: 0.64539 | B: 0.63051 | C: 0.62689
[LOGITS Ex2 A] Mean Abs: 2.056 | Max: 5.959
[LOSS Ex2] A: 0.12718 | B: 0.35015 | C: 0.25767
** [JOINT LOSS] ** : 0.879266
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003961 | Grad Max: 0.115911
  -> Layer: shared_layers.0.bias | Grad Mean: 0.260088 | Grad Max: 1.402621
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.006012
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008918 | Grad Max: 0.008918
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.368945
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031025 | Grad Max: 2.048323
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.007288
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012786 | Grad Max: 0.068241
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000289
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002772 | Grad Max: 0.006677
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000146
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000729 | Grad Max: 0.001869
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000596 | Grad Max: 0.001732
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013550 | Grad Max: 0.013550
[GRADIENT NORM TOTAL] 6.1638

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.845
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7548706  0.24512938] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.072
[MASKS] A(Pass/Fail): 741/1307 | B: 623/1425 | C: 514/1534
[LOSS Ex1] A: 0.63942 | B: 0.62617 | C: 0.62419
[LOGITS Ex2 A] Mean Abs: 2.100 | Max: 5.940
[LOSS Ex2] A: 0.11880 | B: 0.33496 | C: 0.23848
** [JOINT LOSS] ** : 0.860676
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004689 | Grad Max: 0.172172
  -> Layer: shared_layers.0.bias | Grad Mean: 0.406955 | Grad Max: 2.203925
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.006439
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002127 | Grad Max: 0.002127
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002539 | Grad Max: 0.505533
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046822 | Grad Max: 2.823731
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000299 | Grad Max: 0.013042
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021387 | Grad Max: 0.125714
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000405
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004379 | Grad Max: 0.009522
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000212
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001110 | Grad Max: 0.002892
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000758 | Grad Max: 0.002232
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019150 | Grad Max: 0.019150
[GRADIENT NORM TOTAL] 9.2780

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.937
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50100857 0.49899143] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.071
[MASKS] A(Pass/Fail): 715/1333 | B: 586/1270 | C: 521/1527
[LOSS Ex1] A: 0.64605 | B: 0.63047 | C: 0.62361
[LOGITS Ex2 A] Mean Abs: 2.093 | Max: 6.294
[LOSS Ex2] A: 0.11150 | B: 0.32901 | C: 0.25909
** [JOINT LOSS] ** : 0.866577
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004715 | Grad Max: 0.140409
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140224 | Grad Max: 0.547403
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.006191
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009237 | Grad Max: 0.009237
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001160 | Grad Max: 0.203874
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020555 | Grad Max: 1.134210
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.005224
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008013 | Grad Max: 0.048595
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000232
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001771 | Grad Max: 0.004468
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000461 | Grad Max: 0.001276
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000473 | Grad Max: 0.001455
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009326 | Grad Max: 0.009326
[GRADIENT NORM TOTAL] 3.3930

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.655
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.69975126 0.3002488 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.072
[MASKS] A(Pass/Fail): 706/1342 | B: 633/1415 | C: 501/1547
[LOSS Ex1] A: 0.64190 | B: 0.62967 | C: 0.62509
[LOGITS Ex2 A] Mean Abs: 2.104 | Max: 6.942
[LOSS Ex2] A: 0.14430 | B: 0.34875 | C: 0.24886
** [JOINT LOSS] ** : 0.879524
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003016 | Grad Max: 0.081989
  -> Layer: shared_layers.0.bias | Grad Mean: 0.130818 | Grad Max: 0.775195
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.006179
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003924 | Grad Max: 0.003924
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000985 | Grad Max: 0.237168
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016503 | Grad Max: 1.328306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004435
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005491 | Grad Max: 0.041808
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000159
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001016 | Grad Max: 0.003212
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000246 | Grad Max: 0.000933
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000235 | Grad Max: 0.000871
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004548 | Grad Max: 0.004548
[GRADIENT NORM TOTAL] 3.5062

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.758
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62194985 0.37805015] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.072
[MASKS] A(Pass/Fail): 587/1029 | B: 625/1423 | C: 516/1532
[LOSS Ex1] A: 0.64017 | B: 0.63036 | C: 0.62662
[LOGITS Ex2 A] Mean Abs: 2.126 | Max: 6.503
[LOSS Ex2] A: 0.12378 | B: 0.35147 | C: 0.23775
** [JOINT LOSS] ** : 0.870047
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002781 | Grad Max: 0.074229
  -> Layer: shared_layers.0.bias | Grad Mean: 0.189770 | Grad Max: 1.061653
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.006432
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011784 | Grad Max: 0.011784
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001120 | Grad Max: 0.432570
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020216 | Grad Max: 2.426720
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.005542
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007378 | Grad Max: 0.057204
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000196
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001521 | Grad Max: 0.004465
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000109
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000400 | Grad Max: 0.001177
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.001309
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007686 | Grad Max: 0.007686
[GRADIENT NORM TOTAL] 5.0158

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.939
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072333 0.4927667] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.072
[MASKS] A(Pass/Fail): 712/1336 | B: 624/1424 | C: 511/1537
[LOSS Ex1] A: 0.64076 | B: 0.62601 | C: 0.62357
[LOGITS Ex2 A] Mean Abs: 2.121 | Max: 7.405
[LOSS Ex2] A: 0.11858 | B: 0.33344 | C: 0.24514
** [JOINT LOSS] ** : 0.862500
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003053 | Grad Max: 0.078805
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149171 | Grad Max: 0.610070
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002277 | Grad Max: 0.006054
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002359 | Grad Max: 0.002359
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001143 | Grad Max: 0.326546
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020584 | Grad Max: 1.827532
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.007238
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008667 | Grad Max: 0.081118
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000218
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001857 | Grad Max: 0.004997
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000112
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000485 | Grad Max: 0.001367
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001388
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008177 | Grad Max: 0.008177
[GRADIENT NORM TOTAL] 4.2366

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.886
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51034987 0.48965013] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.072
[MASKS] A(Pass/Fail): 706/1342 | B: 586/1270 | C: 505/1543
[LOSS Ex1] A: 0.63769 | B: 0.63032 | C: 0.62600
[LOGITS Ex2 A] Mean Abs: 2.114 | Max: 5.905
[LOSS Ex2] A: 0.12612 | B: 0.32918 | C: 0.23681
** [JOINT LOSS] ** : 0.862041
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003265 | Grad Max: 0.100234
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264095 | Grad Max: 1.493723
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.006798
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007290 | Grad Max: 0.007290
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001536 | Grad Max: 0.507294
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027702 | Grad Max: 2.814058
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000135 | Grad Max: 0.005512
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009564 | Grad Max: 0.051570
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000214
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001823 | Grad Max: 0.005169
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000103
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000436 | Grad Max: 0.001517
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001167
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006479 | Grad Max: 0.006479
[GRADIENT NORM TOTAL] 6.5018

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.914
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.504668 0.495332] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 713/1335 | B: 634/1414 | C: 559/1489
[LOSS Ex1] A: 0.63669 | B: 0.62951 | C: 0.62059
[LOGITS Ex2 A] Mean Abs: 2.090 | Max: 5.775
[LOSS Ex2] A: 0.13576 | B: 0.36304 | C: 0.23714
** [JOINT LOSS] ** : 0.874245
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005839 | Grad Max: 0.196620
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208357 | Grad Max: 0.981326
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006554
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000173 | Grad Max: 0.000173
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.528966
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023039 | Grad Max: 2.941603
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.006180
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004364 | Grad Max: 0.057481
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000154
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000512 | Grad Max: 0.002981
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000058
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000742
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000211 | Grad Max: 0.000739
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003025 | Grad Max: 0.003025
[GRADIENT NORM TOTAL] 5.4922

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.756
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5018491 0.4981509] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.554 | Std: 0.070
[MASKS] A(Pass/Fail): 679/1369 | B: 625/1423 | C: 361/1015
[LOSS Ex1] A: 0.64505 | B: 0.63019 | C: 0.62346
[LOGITS Ex2 A] Mean Abs: 2.089 | Max: 5.945
[LOSS Ex2] A: 0.12461 | B: 0.34946 | C: 0.26225
** [JOINT LOSS] ** : 0.878339
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003150 | Grad Max: 0.089742
  -> Layer: shared_layers.0.bias | Grad Mean: 0.154896 | Grad Max: 0.672900
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.006165
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008049 | Grad Max: 0.008049
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001116 | Grad Max: 0.428503
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019872 | Grad Max: 2.389894
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.004951
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006258 | Grad Max: 0.050676
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000193
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001324 | Grad Max: 0.004167
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000080
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000328 | Grad Max: 0.001003
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000296 | Grad Max: 0.000976
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004505 | Grad Max: 0.004505
[GRADIENT NORM TOTAL] 4.5428

[EPOCH SUMMARY] Train Loss: 0.8710

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8506 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8516 -> New: 0.8506)

############################## EPOCH 126/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.678
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.543824   0.45617598] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.068
[MASKS] A(Pass/Fail): 680/1368 | B: 625/1423 | C: 511/1537
[LOSS Ex1] A: 0.64518 | B: 0.62584 | C: 0.62627
[LOGITS Ex2 A] Mean Abs: 2.076 | Max: 5.628
[LOSS Ex2] A: 0.12662 | B: 0.32739 | C: 0.25217
** [JOINT LOSS] ** : 0.867825
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001767 | Grad Max: 0.039948
  -> Layer: shared_layers.0.bias | Grad Mean: 0.063048 | Grad Max: 0.290060
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006406
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011287 | Grad Max: 0.011287
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000570 | Grad Max: 0.160826
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.009771 | Grad Max: 0.902944
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002897
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001862 | Grad Max: 0.023386
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000133
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000299 | Grad Max: 0.002115
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000055
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000075 | Grad Max: 0.000485
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001037
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000629 | Grad Max: 0.000629
[GRADIENT NORM TOTAL] 2.0422

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.850
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7560879  0.24391206] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.072
[MASKS] A(Pass/Fail): 741/1307 | B: 586/1270 | C: 498/1550
[LOSS Ex1] A: 0.63919 | B: 0.63015 | C: 0.62538
[LOGITS Ex2 A] Mean Abs: 2.101 | Max: 6.584
[LOSS Ex2] A: 0.11415 | B: 0.33238 | C: 0.25623
** [JOINT LOSS] ** : 0.865828
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.070321
  -> Layer: shared_layers.0.bias | Grad Mean: 0.099042 | Grad Max: 0.458998
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006236
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002979 | Grad Max: 0.002979
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000734 | Grad Max: 0.247826
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012747 | Grad Max: 1.391934
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002472
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001967 | Grad Max: 0.017625
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000113
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000326 | Grad Max: 0.001897
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000055
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000088 | Grad Max: 0.000517
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000327 | Grad Max: 0.000934
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000496 | Grad Max: 0.000496
[GRADIENT NORM TOTAL] 3.1797

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.943
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009708  0.49902928] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.071
[MASKS] A(Pass/Fail): 716/1332 | B: 638/1410 | C: 494/1554
[LOSS Ex1] A: 0.64582 | B: 0.62934 | C: 0.62709
[LOGITS Ex2 A] Mean Abs: 2.101 | Max: 6.098
[LOSS Ex2] A: 0.11670 | B: 0.34451 | C: 0.24238
** [JOINT LOSS] ** : 0.868612
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003717 | Grad Max: 0.125754
  -> Layer: shared_layers.0.bias | Grad Mean: 0.117779 | Grad Max: 0.432238
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005537
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003245 | Grad Max: 0.003245
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000999 | Grad Max: 0.227458
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016961 | Grad Max: 1.264416
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.003765
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006020 | Grad Max: 0.034133
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000197
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001428 | Grad Max: 0.004189
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000087
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000378 | Grad Max: 0.001017
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000408 | Grad Max: 0.001257
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006562 | Grad Max: 0.006562
[GRADIENT NORM TOTAL] 3.1551

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.659
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7006436  0.29935646] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.072
[MASKS] A(Pass/Fail): 706/1342 | B: 625/1423 | C: 507/1541
[LOSS Ex1] A: 0.64165 | B: 0.63001 | C: 0.62426
[LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.045
[LOSS Ex2] A: 0.13463 | B: 0.34435 | C: 0.22637
** [JOINT LOSS] ** : 0.867087
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004127 | Grad Max: 0.143336
  -> Layer: shared_layers.0.bias | Grad Mean: 0.332162 | Grad Max: 1.578519
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.005918
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002676 | Grad Max: 0.002676
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.539825
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040482 | Grad Max: 2.995574
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000268 | Grad Max: 0.010963
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019370 | Grad Max: 0.116074
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000352
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003909 | Grad Max: 0.008385
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000179
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001018 | Grad Max: 0.002656
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000689 | Grad Max: 0.001885
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018551 | Grad Max: 0.018551
[GRADIENT NORM TOTAL] 7.9410

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.763
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6224491  0.37755093] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.072
[MASKS] A(Pass/Fail): 587/1029 | B: 625/1423 | C: 513/1535
[LOSS Ex1] A: 0.63991 | B: 0.62565 | C: 0.62559
[LOGITS Ex2 A] Mean Abs: 2.138 | Max: 8.291
[LOSS Ex2] A: 0.11868 | B: 0.32845 | C: 0.24452
** [JOINT LOSS] ** : 0.860934
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003128 | Grad Max: 0.099117
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196023 | Grad Max: 0.797530
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.006121
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006218 | Grad Max: 0.006218
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001362 | Grad Max: 0.338701
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024306 | Grad Max: 1.911547
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.006491
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008646 | Grad Max: 0.059074
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000218
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001573 | Grad Max: 0.004536
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000082
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000373 | Grad Max: 0.001018
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000384 | Grad Max: 0.000993
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005241 | Grad Max: 0.005241
[GRADIENT NORM TOTAL] 5.0170

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.946
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50734025 0.4926598 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 712/1336 | B: 586/1270 | C: 506/1542
[LOSS Ex1] A: 0.64049 | B: 0.62996 | C: 0.62693
[LOGITS Ex2 A] Mean Abs: 2.092 | Max: 7.615
[LOSS Ex2] A: 0.12020 | B: 0.34366 | C: 0.27360
** [JOINT LOSS] ** : 0.878279
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004007 | Grad Max: 0.126769
  -> Layer: shared_layers.0.bias | Grad Mean: 0.378539 | Grad Max: 1.757590
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.006641
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010316 | Grad Max: 0.010316
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.453791
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041605 | Grad Max: 2.506737
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000279 | Grad Max: 0.008874
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020091 | Grad Max: 0.105333
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000333
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003951 | Grad Max: 0.008336
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000181
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001006 | Grad Max: 0.002521
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000757 | Grad Max: 0.001748
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018085 | Grad Max: 0.018085
[GRADIENT NORM TOTAL] 8.3907

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.892
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102672  0.48973283] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.072
[MASKS] A(Pass/Fail): 707/1341 | B: 638/1410 | C: 515/1533
[LOSS Ex1] A: 0.63741 | B: 0.62915 | C: 0.62523
[LOGITS Ex2 A] Mean Abs: 2.102 | Max: 5.885
[LOSS Ex2] A: 0.12789 | B: 0.36112 | C: 0.24484
** [JOINT LOSS] ** : 0.875217
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003674 | Grad Max: 0.169449
  -> Layer: shared_layers.0.bias | Grad Mean: 0.408629 | Grad Max: 2.224521
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.006247
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000599 | Grad Max: 0.000599
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002483 | Grad Max: 0.514651
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045126 | Grad Max: 2.853621
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.011032
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020515 | Grad Max: 0.132069
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000353
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004029 | Grad Max: 0.008438
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000152
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001028 | Grad Max: 0.002419
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000710 | Grad Max: 0.001494
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018439 | Grad Max: 0.018439
[GRADIENT NORM TOTAL] 9.0799

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.920
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50482243 0.49517757] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.072
[MASKS] A(Pass/Fail): 714/1334 | B: 625/1423 | C: 525/1523
[LOSS Ex1] A: 0.63640 | B: 0.62983 | C: 0.62149
[LOGITS Ex2 A] Mean Abs: 2.111 | Max: 7.163
[LOSS Ex2] A: 0.13509 | B: 0.34013 | C: 0.23434
** [JOINT LOSS] ** : 0.865762
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005726 | Grad Max: 0.236953
  -> Layer: shared_layers.0.bias | Grad Mean: 0.133935 | Grad Max: 0.552369
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.006423
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000273 | Grad Max: 0.000273
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.347833
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019301 | Grad Max: 1.844001
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000115 | Grad Max: 0.004509
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007365 | Grad Max: 0.042698
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000306
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001754 | Grad Max: 0.004871
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000445 | Grad Max: 0.001274
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000360 | Grad Max: 0.001244
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006971 | Grad Max: 0.006971
[GRADIENT NORM TOTAL] 3.7945

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.762
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019133 0.4980867] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 681/1367 | B: 625/1423 | C: 537/1511
[LOSS Ex1] A: 0.64478 | B: 0.62547 | C: 0.62031
[LOGITS Ex2 A] Mean Abs: 2.099 | Max: 6.284
[LOSS Ex2] A: 0.11281 | B: 0.32663 | C: 0.24971
** [JOINT LOSS] ** : 0.859901
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002584 | Grad Max: 0.112120
  -> Layer: shared_layers.0.bias | Grad Mean: 0.244650 | Grad Max: 1.286583
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002268 | Grad Max: 0.006150
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007596 | Grad Max: 0.007597
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001512 | Grad Max: 0.290770
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027777 | Grad Max: 1.610811
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.007092
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012149 | Grad Max: 0.077968
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000249
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002427 | Grad Max: 0.005817
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000623 | Grad Max: 0.001731
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001604
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011170 | Grad Max: 0.011170
[GRADIENT NORM TOTAL] 5.5823

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.683
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.543867   0.45613295] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.068
[MASKS] A(Pass/Fail): 680/1368 | B: 586/1270 | C: 549/1499
[LOSS Ex1] A: 0.64492 | B: 0.62978 | C: 0.62006
[LOGITS Ex2 A] Mean Abs: 2.047 | Max: 5.498
[LOSS Ex2] A: 0.12746 | B: 0.33337 | C: 0.22642
** [JOINT LOSS] ** : 0.860671
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002830 | Grad Max: 0.062332
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176788 | Grad Max: 0.722882
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005970
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006644 | Grad Max: 0.006644
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001357 | Grad Max: 0.483373
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024835 | Grad Max: 2.691286
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.007550
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011511 | Grad Max: 0.074518
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000343
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002340 | Grad Max: 0.007282
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000133
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.001817
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000434 | Grad Max: 0.001580
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009610 | Grad Max: 0.009610
[GRADIENT NORM TOTAL] 4.8680

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.856
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75755334 0.24244665] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.072
[MASKS] A(Pass/Fail): 742/1306 | B: 638/1410 | C: 536/1512
[LOSS Ex1] A: 0.63892 | B: 0.62898 | C: 0.61964
[LOGITS Ex2 A] Mean Abs: 2.128 | Max: 6.148
[LOSS Ex2] A: 0.11286 | B: 0.34943 | C: 0.23242
** [JOINT LOSS] ** : 0.860750
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001879 | Grad Max: 0.041778
  -> Layer: shared_layers.0.bias | Grad Mean: 0.099805 | Grad Max: 0.490642
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.006531
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006439 | Grad Max: 0.006439
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000672 | Grad Max: 0.474920
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011660 | Grad Max: 2.636407
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.003265
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001857 | Grad Max: 0.015689
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000114
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000327 | Grad Max: 0.002040
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000051
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000551
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.000831
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001081 | Grad Max: 0.001081
[GRADIENT NORM TOTAL] 3.8033

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.949
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008488 0.4991512] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 716/1332 | B: 626/1422 | C: 528/1520
[LOSS Ex1] A: 0.64557 | B: 0.62965 | C: 0.62048
[LOGITS Ex2 A] Mean Abs: 2.153 | Max: 5.766
[LOSS Ex2] A: 0.11593 | B: 0.34173 | C: 0.24269
** [JOINT LOSS] ** : 0.865348
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003672 | Grad Max: 0.130656
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156219 | Grad Max: 0.743617
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005888
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003150 | Grad Max: 0.003150
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001116 | Grad Max: 0.253714
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018574 | Grad Max: 1.420228
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.004111
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002727 | Grad Max: 0.036392
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000148
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000329 | Grad Max: 0.002296
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000050
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000085 | Grad Max: 0.000569
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000321 | Grad Max: 0.000831
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000098 | Grad Max: 0.000098
[GRADIENT NORM TOTAL] 3.9851

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.663
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70150274 0.2984972 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.072
[MASKS] A(Pass/Fail): 706/1342 | B: 625/1423 | C: 518/1530
[LOSS Ex1] A: 0.64138 | B: 0.62529 | C: 0.62922
[LOGITS Ex2 A] Mean Abs: 2.115 | Max: 5.811
[LOSS Ex2] A: 0.13789 | B: 0.33119 | C: 0.26169
** [JOINT LOSS] ** : 0.875558
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004388 | Grad Max: 0.119497
  -> Layer: shared_layers.0.bias | Grad Mean: 0.197790 | Grad Max: 0.850599
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005761
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000193 | Grad Max: 0.000193
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001457 | Grad Max: 0.357206
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025164 | Grad Max: 2.009839
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.004882
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008137 | Grad Max: 0.054543
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000180
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001546 | Grad Max: 0.004595
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000382 | Grad Max: 0.001091
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000258 | Grad Max: 0.001014
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006310 | Grad Max: 0.006310
[GRADIENT NORM TOTAL] 4.8915

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.767
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6228435  0.37715656] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.072
[MASKS] A(Pass/Fail): 587/1029 | B: 586/1270 | C: 374/1002
[LOSS Ex1] A: 0.63964 | B: 0.62961 | C: 0.62295
[LOGITS Ex2 A] Mean Abs: 2.168 | Max: 8.143
[LOSS Ex2] A: 0.11785 | B: 0.32551 | C: 0.27770
** [JOINT LOSS] ** : 0.871083
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004318 | Grad Max: 0.144405
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269003 | Grad Max: 0.970389
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.006082
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003833 | Grad Max: 0.003833
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001710 | Grad Max: 0.341307
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030889 | Grad Max: 1.908977
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.007806
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014208 | Grad Max: 0.070728
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000312
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003032 | Grad Max: 0.006898
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000156
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000786 | Grad Max: 0.001989
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000673 | Grad Max: 0.001487
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014732 | Grad Max: 0.014732
[GRADIENT NORM TOTAL] 5.7052

[EPOCH SUMMARY] Train Loss: 0.8673

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8485 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8506 -> New: 0.8485)

############################## EPOCH 127/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.951
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50741833 0.4925817 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 712/1336 | B: 638/1410 | C: 532/1516
[LOSS Ex1] A: 0.64024 | B: 0.62881 | C: 0.62270
[LOGITS Ex2 A] Mean Abs: 2.146 | Max: 8.052
[LOSS Ex2] A: 0.11707 | B: 0.35575 | C: 0.24977
** [JOINT LOSS] ** : 0.871444
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003007 | Grad Max: 0.134821
  -> Layer: shared_layers.0.bias | Grad Mean: 0.104221 | Grad Max: 0.448456
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006444
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001940 | Grad Max: 0.001940
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000967 | Grad Max: 0.308257
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016111 | Grad Max: 1.702443
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.004641
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003447 | Grad Max: 0.040813
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000119
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000470 | Grad Max: 0.002704
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000055
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000126 | Grad Max: 0.000622
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000221 | Grad Max: 0.000698
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002706 | Grad Max: 0.002706
[GRADIENT NORM TOTAL] 3.3867

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.898
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101735 0.4898265] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.072
[MASKS] A(Pass/Fail): 707/1341 | B: 627/1421 | C: 500/1548
[LOSS Ex1] A: 0.63715 | B: 0.62947 | C: 0.62535
[LOGITS Ex2 A] Mean Abs: 2.163 | Max: 5.265
[LOSS Ex2] A: 0.13043 | B: 0.34103 | C: 0.25449
** [JOINT LOSS] ** : 0.872642
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006828 | Grad Max: 0.212290
  -> Layer: shared_layers.0.bias | Grad Mean: 0.383471 | Grad Max: 1.442111
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.006231
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001975 | Grad Max: 0.001975
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002712 | Grad Max: 0.470730
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049490 | Grad Max: 2.624038
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000337 | Grad Max: 0.011384
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023829 | Grad Max: 0.124657
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000520
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005009 | Grad Max: 0.011195
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000231
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001261 | Grad Max: 0.003089
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000929 | Grad Max: 0.002192
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021668 | Grad Max: 0.021668
[GRADIENT NORM TOTAL] 8.3823

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.926
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5049619  0.49503812] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.072
[MASKS] A(Pass/Fail): 714/1334 | B: 625/1423 | C: 504/1544
[LOSS Ex1] A: 0.63614 | B: 0.62512 | C: 0.62413
[LOGITS Ex2 A] Mean Abs: 2.130 | Max: 7.850
[LOSS Ex2] A: 0.13873 | B: 0.33458 | C: 0.22906
** [JOINT LOSS] ** : 0.862587
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006308 | Grad Max: 0.253398
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182261 | Grad Max: 0.638018
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002302 | Grad Max: 0.006521
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000302 | Grad Max: 0.000302
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001471 | Grad Max: 0.346287
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025166 | Grad Max: 1.904089
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000153 | Grad Max: 0.006095
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010327 | Grad Max: 0.058345
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000319
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002413 | Grad Max: 0.005867
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000131
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000622 | Grad Max: 0.001785
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000468 | Grad Max: 0.001695
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010640 | Grad Max: 0.010640
[GRADIENT NORM TOTAL] 4.5892

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.766
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019249  0.49807513] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 681/1367 | B: 587/1269 | C: 559/1489
[LOSS Ex1] A: 0.64453 | B: 0.62943 | C: 0.62076
[LOGITS Ex2 A] Mean Abs: 2.059 | Max: 6.195
[LOSS Ex2] A: 0.12073 | B: 0.34398 | C: 0.26219
** [JOINT LOSS] ** : 0.873877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004424 | Grad Max: 0.171149
  -> Layer: shared_layers.0.bias | Grad Mean: 0.479756 | Grad Max: 2.250980
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005591
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003947 | Grad Max: 0.003947
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003193 | Grad Max: 0.577851
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059689 | Grad Max: 3.257793
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.016781
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031412 | Grad Max: 0.179261
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000633
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006299 | Grad Max: 0.013369
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000284
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001603 | Grad Max: 0.003948
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001187 | Grad Max: 0.002381
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028609 | Grad Max: 0.028609
[GRADIENT NORM TOTAL] 10.8537

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.687
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.543837 0.456163] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.069
[MASKS] A(Pass/Fail): 680/1368 | B: 638/1410 | C: 530/1518
[LOSS Ex1] A: 0.64468 | B: 0.62863 | C: 0.62360
[LOGITS Ex2 A] Mean Abs: 2.051 | Max: 6.054
[LOSS Ex2] A: 0.13209 | B: 0.36166 | C: 0.24274
** [JOINT LOSS] ** : 0.877799
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005931 | Grad Max: 0.176405
  -> Layer: shared_layers.0.bias | Grad Mean: 0.479489 | Grad Max: 2.339409
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005761
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008432 | Grad Max: 0.008432
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003257 | Grad Max: 0.553576
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060267 | Grad Max: 3.134782
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.016395
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029383 | Grad Max: 0.197911
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000590
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006098 | Grad Max: 0.013431
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000256
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001545 | Grad Max: 0.003652
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001153 | Grad Max: 0.002415
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027350 | Grad Max: 0.027350
[GRADIENT NORM TOTAL] 11.0472

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.861
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7587285  0.24127145] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.073
[MASKS] A(Pass/Fail): 742/1306 | B: 627/1421 | C: 533/1515
[LOSS Ex1] A: 0.63868 | B: 0.62931 | C: 0.62350
[LOGITS Ex2 A] Mean Abs: 2.137 | Max: 6.637
[LOSS Ex2] A: 0.11854 | B: 0.33715 | C: 0.24908
** [JOINT LOSS] ** : 0.865416
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002686 | Grad Max: 0.078366
  -> Layer: shared_layers.0.bias | Grad Mean: 0.090883 | Grad Max: 0.358620
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006594
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003896 | Grad Max: 0.003896
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000858 | Grad Max: 0.185811
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014689 | Grad Max: 1.021797
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.004062
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002380 | Grad Max: 0.025830
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000121
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000336 | Grad Max: 0.002015
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000058
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000087 | Grad Max: 0.000512
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000345 | Grad Max: 0.000971
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000294 | Grad Max: 0.000294
[GRADIENT NORM TOTAL] 3.2112

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.954
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50082356 0.49917647] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.071
[MASKS] A(Pass/Fail): 716/1332 | B: 627/1421 | C: 522/1526
[LOSS Ex1] A: 0.64534 | B: 0.62495 | C: 0.62304
[LOGITS Ex2 A] Mean Abs: 2.153 | Max: 5.854
[LOSS Ex2] A: 0.10553 | B: 0.32409 | C: 0.23171
** [JOINT LOSS] ** : 0.851554
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004252 | Grad Max: 0.149725
  -> Layer: shared_layers.0.bias | Grad Mean: 0.354108 | Grad Max: 1.812570
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.005535
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000613 | Grad Max: 0.000613
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002409 | Grad Max: 0.345425
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043830 | Grad Max: 1.928179
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000300 | Grad Max: 0.014199
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021477 | Grad Max: 0.143479
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000357
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004087 | Grad Max: 0.009375
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000175
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001046 | Grad Max: 0.002378
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000690 | Grad Max: 0.002099
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018498 | Grad Max: 0.018498
[GRADIENT NORM TOTAL] 8.0195

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.667
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70233244 0.29766756] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.072
[MASKS] A(Pass/Fail): 708/1340 | B: 588/1268 | C: 514/1534
[LOSS Ex1] A: 0.64115 | B: 0.62927 | C: 0.62501
[LOGITS Ex2 A] Mean Abs: 2.132 | Max: 6.845
[LOSS Ex2] A: 0.13196 | B: 0.31977 | C: 0.23611
** [JOINT LOSS] ** : 0.861090
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004226 | Grad Max: 0.140490
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139578 | Grad Max: 0.630345
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.006123
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003142 | Grad Max: 0.003142
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001214 | Grad Max: 0.401176
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020184 | Grad Max: 2.250614
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.005339
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005306 | Grad Max: 0.053912
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000187
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000855 | Grad Max: 0.003276
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000238 | Grad Max: 0.000849
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000382 | Grad Max: 0.001091
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004282 | Grad Max: 0.004282
[GRADIENT NORM TOTAL] 4.2493

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.772
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62326705 0.37673295] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.073
[MASKS] A(Pass/Fail): 588/1028 | B: 639/1409 | C: 518/1530
[LOSS Ex1] A: 0.63940 | B: 0.62848 | C: 0.62524
[LOGITS Ex2 A] Mean Abs: 2.136 | Max: 7.115
[LOSS Ex2] A: 0.12387 | B: 0.34806 | C: 0.24925
** [JOINT LOSS] ** : 0.871435
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004938 | Grad Max: 0.135684
  -> Layer: shared_layers.0.bias | Grad Mean: 0.284065 | Grad Max: 1.212398
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005938
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002998 | Grad Max: 0.002998
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001819 | Grad Max: 0.414730
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032804 | Grad Max: 2.324109
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.006963
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015858 | Grad Max: 0.080727
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000316
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003410 | Grad Max: 0.007253
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000882 | Grad Max: 0.002231
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000699 | Grad Max: 0.001648
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016234 | Grad Max: 0.016234
[GRADIENT NORM TOTAL] 6.1430

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.956
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074105  0.49258944] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 712/1336 | B: 627/1421 | C: 546/1502
[LOSS Ex1] A: 0.64001 | B: 0.62916 | C: 0.62043
[LOGITS Ex2 A] Mean Abs: 2.107 | Max: 7.173
[LOSS Ex2] A: 0.11787 | B: 0.34057 | C: 0.25558
** [JOINT LOSS] ** : 0.867875
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001859 | Grad Max: 0.040924
  -> Layer: shared_layers.0.bias | Grad Mean: 0.075306 | Grad Max: 0.362504
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.005976
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000646 | Grad Max: 0.000646
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000744 | Grad Max: 0.320179
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012980 | Grad Max: 1.793447
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003210
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002999 | Grad Max: 0.034369
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000129
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000534 | Grad Max: 0.002855
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000135 | Grad Max: 0.000595
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000308 | Grad Max: 0.001062
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003730 | Grad Max: 0.003730
[GRADIENT NORM TOTAL] 3.1660

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.902
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51012504 0.48987496] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 707/1341 | B: 627/1421 | C: 497/1551
[LOSS Ex1] A: 0.63692 | B: 0.62480 | C: 0.62594
[LOGITS Ex2 A] Mean Abs: 2.136 | Max: 5.546
[LOSS Ex2] A: 0.12075 | B: 0.32689 | C: 0.24621
** [JOINT LOSS] ** : 0.860508
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004895 | Grad Max: 0.169071
  -> Layer: shared_layers.0.bias | Grad Mean: 0.254663 | Grad Max: 1.450734
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.005931
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003508 | Grad Max: 0.003508
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001770 | Grad Max: 0.325539
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031450 | Grad Max: 1.821153
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000181 | Grad Max: 0.006624
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012598 | Grad Max: 0.073893
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000333
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002788 | Grad Max: 0.006574
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000124
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000708 | Grad Max: 0.001868
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000516 | Grad Max: 0.001633
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012378 | Grad Max: 0.012378
[GRADIENT NORM TOTAL] 6.0482

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.931
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.505041 0.494959] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.072
[MASKS] A(Pass/Fail): 714/1334 | B: 589/1267 | C: 534/1514
[LOSS Ex1] A: 0.63592 | B: 0.62912 | C: 0.62170
[LOGITS Ex2 A] Mean Abs: 2.118 | Max: 6.590
[LOSS Ex2] A: 0.14358 | B: 0.32609 | C: 0.24678
** [JOINT LOSS] ** : 0.867724
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004576 | Grad Max: 0.214866
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107637 | Grad Max: 0.400812
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.006358
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002415 | Grad Max: 0.002415
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001071 | Grad Max: 0.298623
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017030 | Grad Max: 1.626941
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.003390
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002802 | Grad Max: 0.026889
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000179
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000621 | Grad Max: 0.003019
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000064
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000147 | Grad Max: 0.000615
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000254 | Grad Max: 0.000860
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001652 | Grad Max: 0.001652
[GRADIENT NORM TOTAL] 3.7537

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.770
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50189596 0.498104  ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 681/1367 | B: 639/1409 | C: 523/1525
[LOSS Ex1] A: 0.64432 | B: 0.62833 | C: 0.62275
[LOGITS Ex2 A] Mean Abs: 2.065 | Max: 6.270
[LOSS Ex2] A: 0.11933 | B: 0.34711 | C: 0.23877
** [JOINT LOSS] ** : 0.866870
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002773 | Grad Max: 0.091100
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215909 | Grad Max: 1.162116
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005864
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000155 | Grad Max: 0.000155
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001582 | Grad Max: 0.464444
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029032 | Grad Max: 2.601628
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000181 | Grad Max: 0.008291
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012995 | Grad Max: 0.082491
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000251
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002645 | Grad Max: 0.005798
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000145
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000694 | Grad Max: 0.001996
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000597 | Grad Max: 0.001885
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013750 | Grad Max: 0.013750
[GRADIENT NORM TOTAL] 5.8381

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.691
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438614  0.45613867] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.069
[MASKS] A(Pass/Fail): 681/1367 | B: 627/1421 | C: 339/1037
[LOSS Ex1] A: 0.64448 | B: 0.62899 | C: 0.62260
[LOGITS Ex2 A] Mean Abs: 2.067 | Max: 5.261
[LOSS Ex2] A: 0.11889 | B: 0.33962 | C: 0.22981
** [JOINT LOSS] ** : 0.861467
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001855 | Grad Max: 0.033617
  -> Layer: shared_layers.0.bias | Grad Mean: 0.086932 | Grad Max: 0.457515
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005828
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005936 | Grad Max: 0.005936
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000689 | Grad Max: 0.358176
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012037 | Grad Max: 2.000298
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002572
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002043 | Grad Max: 0.017685
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000127
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000323 | Grad Max: 0.002430
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000082 | Grad Max: 0.000453
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000402 | Grad Max: 0.000882
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000449 | Grad Max: 0.000449
[GRADIENT NORM TOTAL] 3.3991

[EPOCH SUMMARY] Train Loss: 0.8666

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8482 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8485 -> New: 0.8482)

############################## EPOCH 128/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.865
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7599319  0.24006806] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.073
[MASKS] A(Pass/Fail): 744/1304 | B: 627/1421 | C: 503/1545
[LOSS Ex1] A: 0.63845 | B: 0.62463 | C: 0.62765
[LOGITS Ex2 A] Mean Abs: 2.138 | Max: 5.791
[LOSS Ex2] A: 0.10998 | B: 0.32057 | C: 0.24656
** [JOINT LOSS] ** : 0.855951
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002615 | Grad Max: 0.083204
  -> Layer: shared_layers.0.bias | Grad Mean: 0.205121 | Grad Max: 1.080095
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006403
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006141 | Grad Max: 0.006141
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001474 | Grad Max: 0.275856
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027116 | Grad Max: 1.531781
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.006757
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012599 | Grad Max: 0.080096
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000256
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002540 | Grad Max: 0.005972
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000124
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000634 | Grad Max: 0.001867
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000468 | Grad Max: 0.001679
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010563 | Grad Max: 0.010563
[GRADIENT NORM TOTAL] 5.0099

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.960
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008322  0.49916783] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.072
[MASKS] A(Pass/Fail): 716/1332 | B: 592/1264 | C: 518/1530
[LOSS Ex1] A: 0.64513 | B: 0.62895 | C: 0.62395
[LOGITS Ex2 A] Mean Abs: 2.138 | Max: 6.123
[LOSS Ex2] A: 0.11095 | B: 0.32394 | C: 0.23623
** [JOINT LOSS] ** : 0.856381
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003508 | Grad Max: 0.140954
  -> Layer: shared_layers.0.bias | Grad Mean: 0.092059 | Grad Max: 0.459773
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005727
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001036 | Grad Max: 0.001036
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000854 | Grad Max: 0.281484
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013835 | Grad Max: 1.583864
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.004304
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003880 | Grad Max: 0.031221
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000193
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000970 | Grad Max: 0.003881
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000233 | Grad Max: 0.000943
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001150
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003841 | Grad Max: 0.003841
[GRADIENT NORM TOTAL] 2.9896

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.671
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70319766 0.29680237] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.073
[MASKS] A(Pass/Fail): 708/1340 | B: 640/1408 | C: 524/1524
[LOSS Ex1] A: 0.64091 | B: 0.62815 | C: 0.62246
[LOGITS Ex2 A] Mean Abs: 2.131 | Max: 6.805
[LOSS Ex2] A: 0.14161 | B: 0.35807 | C: 0.22443
** [JOINT LOSS] ** : 0.871879
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004510 | Grad Max: 0.153747
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153671 | Grad Max: 0.593802
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006380
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000214 | Grad Max: 0.000214
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001250 | Grad Max: 0.145133
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021809 | Grad Max: 0.771722
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000141 | Grad Max: 0.005844
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009747 | Grad Max: 0.054967
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000303
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002134 | Grad Max: 0.005895
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000122
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000505 | Grad Max: 0.001543
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001424
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008024 | Grad Max: 0.008024
[GRADIENT NORM TOTAL] 3.4729

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.777
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62374854 0.37625143] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.073
[MASKS] A(Pass/Fail): 588/1028 | B: 628/1420 | C: 505/1543
[LOSS Ex1] A: 0.63916 | B: 0.62881 | C: 0.62502
[LOGITS Ex2 A] Mean Abs: 2.183 | Max: 7.842
[LOSS Ex2] A: 0.11775 | B: 0.34491 | C: 0.27732
** [JOINT LOSS] ** : 0.877656
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004135 | Grad Max: 0.154977
  -> Layer: shared_layers.0.bias | Grad Mean: 0.416544 | Grad Max: 2.050894
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006584
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010267 | Grad Max: 0.010267
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002671 | Grad Max: 0.398786
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049520 | Grad Max: 2.208066
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000331 | Grad Max: 0.011998
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024121 | Grad Max: 0.129058
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000476
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004836 | Grad Max: 0.011438
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000219
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001184 | Grad Max: 0.003023
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000761 | Grad Max: 0.001822
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018718 | Grad Max: 0.018718
[GRADIENT NORM TOTAL] 9.1739

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.962
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50740767 0.49259233] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.073
[MASKS] A(Pass/Fail): 714/1334 | B: 628/1420 | C: 527/1521
[LOSS Ex1] A: 0.63977 | B: 0.62444 | C: 0.61964
[LOGITS Ex2 A] Mean Abs: 2.171 | Max: 7.861
[LOSS Ex2] A: 0.11687 | B: 0.32640 | C: 0.25469
** [JOINT LOSS] ** : 0.860605
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005442 | Grad Max: 0.178751
  -> Layer: shared_layers.0.bias | Grad Mean: 0.344632 | Grad Max: 1.360460
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002347 | Grad Max: 0.006267
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002449 | Grad Max: 0.002449
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002414 | Grad Max: 0.406335
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043773 | Grad Max: 2.281309
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.012798
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021041 | Grad Max: 0.128095
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000414
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004447 | Grad Max: 0.009092
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001140 | Grad Max: 0.002754
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000803 | Grad Max: 0.002157
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020009 | Grad Max: 0.020009
[GRADIENT NORM TOTAL] 7.7682

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.907
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101135  0.48988655] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 707/1341 | B: 592/1264 | C: 549/1499
[LOSS Ex1] A: 0.63667 | B: 0.62876 | C: 0.62073
[LOGITS Ex2 A] Mean Abs: 2.133 | Max: 6.342
[LOSS Ex2] A: 0.12590 | B: 0.33511 | C: 0.24792
** [JOINT LOSS] ** : 0.865027
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003479 | Grad Max: 0.092943
  -> Layer: shared_layers.0.bias | Grad Mean: 0.234595 | Grad Max: 0.930831
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.007138
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006234 | Grad Max: 0.006234
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001488 | Grad Max: 0.181911
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026194 | Grad Max: 1.008165
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000157 | Grad Max: 0.006365
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011258 | Grad Max: 0.064774
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000259
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002097 | Grad Max: 0.005365
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000522 | Grad Max: 0.001540
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000344 | Grad Max: 0.001248
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008976 | Grad Max: 0.008976
[GRADIENT NORM TOTAL] 4.6507

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.936
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050783  0.49492168] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 714/1334 | B: 640/1408 | C: 527/1521
[LOSS Ex1] A: 0.63566 | B: 0.62797 | C: 0.62463
[LOGITS Ex2 A] Mean Abs: 2.121 | Max: 6.627
[LOSS Ex2] A: 0.13223 | B: 0.34635 | C: 0.24180
** [JOINT LOSS] ** : 0.869544
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004543 | Grad Max: 0.176911
  -> Layer: shared_layers.0.bias | Grad Mean: 0.248644 | Grad Max: 1.137499
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.006122
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002953 | Grad Max: 0.002953
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001701 | Grad Max: 0.253466
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029109 | Grad Max: 1.391820
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.007409
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011723 | Grad Max: 0.091462
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000201
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002065 | Grad Max: 0.005173
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000548 | Grad Max: 0.001451
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000389 | Grad Max: 0.001177
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010707 | Grad Max: 0.010707
[GRADIENT NORM TOTAL] 5.4486

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.774
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50182736 0.49817264] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.072
[MASKS] A(Pass/Fail): 681/1367 | B: 629/1419 | C: 534/1514
[LOSS Ex1] A: 0.64409 | B: 0.62864 | C: 0.62178
[LOGITS Ex2 A] Mean Abs: 2.122 | Max: 7.454
[LOSS Ex2] A: 0.11489 | B: 0.34998 | C: 0.22742
** [JOINT LOSS] ** : 0.862265
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003986 | Grad Max: 0.122462
  -> Layer: shared_layers.0.bias | Grad Mean: 0.151273 | Grad Max: 0.604920
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005786
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006095 | Grad Max: 0.006095
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001135 | Grad Max: 0.240657
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019734 | Grad Max: 1.360889
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.003864
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006449 | Grad Max: 0.035613
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000228
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001510 | Grad Max: 0.004377
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000087
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000392 | Grad Max: 0.001138
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.001240
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006395 | Grad Max: 0.006395
[GRADIENT NORM TOTAL] 3.8689

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.695
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439652  0.45603484] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.069
[MASKS] A(Pass/Fail): 682/1366 | B: 631/1417 | C: 501/1547
[LOSS Ex1] A: 0.64425 | B: 0.62427 | C: 0.62642
[LOGITS Ex2 A] Mean Abs: 2.108 | Max: 6.342
[LOSS Ex2] A: 0.13127 | B: 0.32761 | C: 0.22973
** [JOINT LOSS] ** : 0.861184
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003660 | Grad Max: 0.126367
  -> Layer: shared_layers.0.bias | Grad Mean: 0.154341 | Grad Max: 0.614713
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.005801
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008483 | Grad Max: 0.008483
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001150 | Grad Max: 0.363852
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019773 | Grad Max: 2.064565
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.004569
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004235 | Grad Max: 0.039699
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000158
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000685 | Grad Max: 0.002561
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000202 | Grad Max: 0.000737
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000355 | Grad Max: 0.001016
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004284 | Grad Max: 0.004284
[GRADIENT NORM TOTAL] 4.6306

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.870
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7613682  0.23863181] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.073
[MASKS] A(Pass/Fail): 745/1303 | B: 592/1264 | C: 526/1522
[LOSS Ex1] A: 0.63820 | B: 0.62859 | C: 0.62202
[LOGITS Ex2 A] Mean Abs: 2.137 | Max: 6.818
[LOSS Ex2] A: 0.11236 | B: 0.32666 | C: 0.25667
** [JOINT LOSS] ** : 0.861501
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.041869
  -> Layer: shared_layers.0.bias | Grad Mean: 0.102976 | Grad Max: 0.504307
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002330 | Grad Max: 0.006193
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010882 | Grad Max: 0.010882
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000807 | Grad Max: 0.329785
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014378 | Grad Max: 1.849806
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002703
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002187 | Grad Max: 0.026585
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000131
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000362 | Grad Max: 0.002093
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000049
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000099 | Grad Max: 0.000457
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000301 | Grad Max: 0.000737
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000581 | Grad Max: 0.000581
[GRADIENT NORM TOTAL] 3.9964

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.965
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50083625 0.4991637 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.072
[MASKS] A(Pass/Fail): 716/1332 | B: 640/1408 | C: 545/1503
[LOSS Ex1] A: 0.64490 | B: 0.62780 | C: 0.62135
[LOGITS Ex2 A] Mean Abs: 2.146 | Max: 5.835
[LOSS Ex2] A: 0.10832 | B: 0.34493 | C: 0.23973
** [JOINT LOSS] ** : 0.862343
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005429 | Grad Max: 0.210309
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207324 | Grad Max: 0.913979
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.005712
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000847 | Grad Max: 0.000847
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.438434
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025096 | Grad Max: 2.476779
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.004634
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009247 | Grad Max: 0.041349
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000286
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002160 | Grad Max: 0.005931
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000133
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000541 | Grad Max: 0.001524
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000507 | Grad Max: 0.001745
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009904 | Grad Max: 0.009904
[GRADIENT NORM TOTAL] 5.0769

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.676
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70413005 0.29587   ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.073
[MASKS] A(Pass/Fail): 709/1339 | B: 630/1418 | C: 550/1498
[LOSS Ex1] A: 0.64067 | B: 0.62846 | C: 0.61806
[LOGITS Ex2 A] Mean Abs: 2.155 | Max: 6.460
[LOSS Ex2] A: 0.12937 | B: 0.34655 | C: 0.23056
** [JOINT LOSS] ** : 0.864550
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004432 | Grad Max: 0.152575
  -> Layer: shared_layers.0.bias | Grad Mean: 0.209173 | Grad Max: 1.272265
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006150
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008407 | Grad Max: 0.008407
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001384 | Grad Max: 0.247984
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023776 | Grad Max: 1.325265
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.004987
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006416 | Grad Max: 0.048588
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000174
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001005 | Grad Max: 0.003526
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000098
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000223 | Grad Max: 0.000975
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000293 | Grad Max: 0.000869
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002397 | Grad Max: 0.002397
[GRADIENT NORM TOTAL] 4.6348

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.782
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62425774 0.37574223] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.073
[MASKS] A(Pass/Fail): 588/1028 | B: 631/1417 | C: 517/1531
[LOSS Ex1] A: 0.63892 | B: 0.62409 | C: 0.62466
[LOGITS Ex2 A] Mean Abs: 2.207 | Max: 7.404
[LOSS Ex2] A: 0.12043 | B: 0.32705 | C: 0.23070
** [JOINT LOSS] ** : 0.855281
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002554 | Grad Max: 0.088602
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143342 | Grad Max: 0.875093
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.005790
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005515 | Grad Max: 0.005515
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000958 | Grad Max: 0.184235
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016698 | Grad Max: 1.021310
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.004360
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005594 | Grad Max: 0.042428
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000174
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000994 | Grad Max: 0.003395
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000093
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000246 | Grad Max: 0.001087
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001112
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003575 | Grad Max: 0.003575
[GRADIENT NORM TOTAL] 3.2787

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.967
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50739634 0.49260363] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 714/1334 | B: 592/1264 | C: 370/1006
[LOSS Ex1] A: 0.63952 | B: 0.62840 | C: 0.61917
[LOGITS Ex2 A] Mean Abs: 2.152 | Max: 8.942
[LOSS Ex2] A: 0.11609 | B: 0.33557 | C: 0.25745
** [JOINT LOSS] ** : 0.865400
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003068 | Grad Max: 0.107620
  -> Layer: shared_layers.0.bias | Grad Mean: 0.281411 | Grad Max: 1.268841
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.006222
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000314 | Grad Max: 0.000314
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001735 | Grad Max: 0.508929
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031363 | Grad Max: 2.837721
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.009785
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014486 | Grad Max: 0.103243
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000239
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002747 | Grad Max: 0.006054
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000138
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000724 | Grad Max: 0.001883
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000582 | Grad Max: 0.001766
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013887 | Grad Max: 0.013887
[GRADIENT NORM TOTAL] 6.3713

[EPOCH SUMMARY] Train Loss: 0.8635

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8463 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8482 -> New: 0.8463)

############################## EPOCH 129/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.913
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5100385 0.4899615] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 708/1340 | B: 640/1408 | C: 553/1495
[LOSS Ex1] A: 0.63641 | B: 0.62761 | C: 0.61910
[LOGITS Ex2 A] Mean Abs: 2.161 | Max: 6.016
[LOSS Ex2] A: 0.12278 | B: 0.35157 | C: 0.23747
** [JOINT LOSS] ** : 0.864985
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005707 | Grad Max: 0.207629
  -> Layer: shared_layers.0.bias | Grad Mean: 0.211517 | Grad Max: 0.984209
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006585
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003535 | Grad Max: 0.003535
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001432 | Grad Max: 0.526078
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023650 | Grad Max: 2.937972
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.006379
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004580 | Grad Max: 0.049479
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000132
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000558 | Grad Max: 0.002835
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000142 | Grad Max: 0.000805
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000187 | Grad Max: 0.000672
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002181 | Grad Max: 0.002181
[GRADIENT NORM TOTAL] 5.3454

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.942
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5051503  0.49484965] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 715/1333 | B: 630/1418 | C: 553/1495
[LOSS Ex1] A: 0.63540 | B: 0.62827 | C: 0.61759
[LOGITS Ex2 A] Mean Abs: 2.142 | Max: 8.446
[LOSS Ex2] A: 0.14214 | B: 0.35194 | C: 0.22494
** [JOINT LOSS] ** : 0.866759
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007885 | Grad Max: 0.344048
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207825 | Grad Max: 0.667572
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.006492
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003923 | Grad Max: 0.003923
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001698 | Grad Max: 0.296531
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029015 | Grad Max: 1.611314
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.006257
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011960 | Grad Max: 0.056236
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000358
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002790 | Grad Max: 0.006602
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000149
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000710 | Grad Max: 0.001774
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000552 | Grad Max: 0.001509
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011918 | Grad Max: 0.011918
[GRADIENT NORM TOTAL] 5.2166

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.779
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017795  0.49822047] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.555 | Std: 0.072
[MASKS] A(Pass/Fail): 681/1367 | B: 633/1415 | C: 537/1511
[LOSS Ex1] A: 0.64385 | B: 0.62390 | C: 0.62108
[LOGITS Ex2 A] Mean Abs: 2.108 | Max: 5.979
[LOSS Ex2] A: 0.11353 | B: 0.32649 | C: 0.22647
** [JOINT LOSS] ** : 0.851773
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002512 | Grad Max: 0.068685
  -> Layer: shared_layers.0.bias | Grad Mean: 0.085445 | Grad Max: 0.333977
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.005882
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006021 | Grad Max: 0.006021
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000768 | Grad Max: 0.172553
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013078 | Grad Max: 0.955764
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.003502
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003696 | Grad Max: 0.031683
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000148
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000611 | Grad Max: 0.003068
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000149 | Grad Max: 0.000839
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000464 | Grad Max: 0.001069
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001594 | Grad Max: 0.001594
[GRADIENT NORM TOTAL] 2.4660

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.700
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439862  0.45601383] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.554 | Std: 0.070
[MASKS] A(Pass/Fail): 682/1366 | B: 592/1264 | C: 529/1519
[LOSS Ex1] A: 0.64402 | B: 0.62821 | C: 0.62181
[LOGITS Ex2 A] Mean Abs: 2.101 | Max: 5.940
[LOSS Ex2] A: 0.11997 | B: 0.31643 | C: 0.24345
** [JOINT LOSS] ** : 0.857961
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002782 | Grad Max: 0.077492
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094337 | Grad Max: 0.517804
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.006151
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009549 | Grad Max: 0.009549
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000745 | Grad Max: 0.261643
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012918 | Grad Max: 1.455987
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002768
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002723 | Grad Max: 0.021789
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000147
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000535 | Grad Max: 0.002740
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000136 | Grad Max: 0.000658
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000354 | Grad Max: 0.001292
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002532 | Grad Max: 0.002533
[GRADIENT NORM TOTAL] 3.0685

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.876
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7627915  0.23720849] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.074
[MASKS] A(Pass/Fail): 745/1303 | B: 641/1407 | C: 547/1501
[LOSS Ex1] A: 0.63794 | B: 0.62742 | C: 0.61778
[LOGITS Ex2 A] Mean Abs: 2.166 | Max: 6.109
[LOSS Ex2] A: 0.11464 | B: 0.35120 | C: 0.25397
** [JOINT LOSS] ** : 0.867652
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003156 | Grad Max: 0.097638
  -> Layer: shared_layers.0.bias | Grad Mean: 0.233562 | Grad Max: 1.214011
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002258 | Grad Max: 0.005944
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001078 | Grad Max: 0.001078
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001694 | Grad Max: 0.370108
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030379 | Grad Max: 2.070881
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000180 | Grad Max: 0.009348
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012966 | Grad Max: 0.097438
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000258
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002396 | Grad Max: 0.005973
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000132
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000581 | Grad Max: 0.001831
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.001119
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009025 | Grad Max: 0.009025
[GRADIENT NORM TOTAL] 5.8435

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.971
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50080895 0.49919108] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.072
[MASKS] A(Pass/Fail): 716/1332 | B: 630/1418 | C: 514/1534
[LOSS Ex1] A: 0.64465 | B: 0.62808 | C: 0.62027
[LOGITS Ex2 A] Mean Abs: 2.173 | Max: 5.905
[LOSS Ex2] A: 0.10983 | B: 0.34399 | C: 0.24043
** [JOINT LOSS] ** : 0.862414
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005196 | Grad Max: 0.189595
  -> Layer: shared_layers.0.bias | Grad Mean: 0.148567 | Grad Max: 0.507771
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005805
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006182 | Grad Max: 0.006182
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001067 | Grad Max: 0.418932
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017068 | Grad Max: 2.363113
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.003650
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003527 | Grad Max: 0.027901
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000226
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000927 | Grad Max: 0.003346
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000246 | Grad Max: 0.000888
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000361 | Grad Max: 0.001304
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005536 | Grad Max: 0.005536
[GRADIENT NORM TOTAL] 4.1787

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.681
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70503616 0.29496378] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.073
[MASKS] A(Pass/Fail): 709/1339 | B: 633/1415 | C: 528/1520
[LOSS Ex1] A: 0.64039 | B: 0.62370 | C: 0.62203
[LOGITS Ex2 A] Mean Abs: 2.145 | Max: 6.514
[LOSS Ex2] A: 0.13376 | B: 0.31815 | C: 0.22229
** [JOINT LOSS] ** : 0.853443
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006961 | Grad Max: 0.228961
  -> Layer: shared_layers.0.bias | Grad Mean: 0.214614 | Grad Max: 0.749895
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.006227
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000680 | Grad Max: 0.000680
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001285 | Grad Max: 0.504285
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021311 | Grad Max: 2.806099
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.004544
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006559 | Grad Max: 0.038096
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000266
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001626 | Grad Max: 0.004341
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000106
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000416 | Grad Max: 0.001285
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001492
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007309 | Grad Max: 0.007309
[GRADIENT NORM TOTAL] 5.6035

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.787
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62469965 0.3753004 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 588/1028 | B: 592/1264 | C: 554/1494
[LOSS Ex1] A: 0.63863 | B: 0.62802 | C: 0.62388
[LOGITS Ex2 A] Mean Abs: 2.210 | Max: 7.787
[LOSS Ex2] A: 0.11685 | B: 0.31523 | C: 0.24967
** [JOINT LOSS] ** : 0.857424
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003356 | Grad Max: 0.129512
  -> Layer: shared_layers.0.bias | Grad Mean: 0.147307 | Grad Max: 0.570003
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006253
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001580 | Grad Max: 0.001580
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000845 | Grad Max: 0.410632
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013884 | Grad Max: 2.279090
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003057
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002126 | Grad Max: 0.024275
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000133
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000386 | Grad Max: 0.002199
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000062
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000099 | Grad Max: 0.000625
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000262 | Grad Max: 0.000788
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001169 | Grad Max: 0.001169
[GRADIENT NORM TOTAL] 4.3381

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.974
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074751 0.4925249] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 714/1334 | B: 642/1406 | C: 513/1535
[LOSS Ex1] A: 0.63924 | B: 0.62723 | C: 0.62447
[LOGITS Ex2 A] Mean Abs: 2.208 | Max: 8.700
[LOSS Ex2] A: 0.11753 | B: 0.34012 | C: 0.24563
** [JOINT LOSS] ** : 0.864747
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004040 | Grad Max: 0.114439
  -> Layer: shared_layers.0.bias | Grad Mean: 0.220191 | Grad Max: 1.464141
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006031
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003742 | Grad Max: 0.003742
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001500 | Grad Max: 0.283386
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027368 | Grad Max: 1.588432
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.007690
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014592 | Grad Max: 0.080773
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000313
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002986 | Grad Max: 0.007426
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000710 | Grad Max: 0.001927
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000472 | Grad Max: 0.001339
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011050 | Grad Max: 0.011050
[GRADIENT NORM TOTAL] 4.9043

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.919
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50987405 0.49012598] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 708/1340 | B: 630/1418 | C: 508/1540
[LOSS Ex1] A: 0.63612 | B: 0.62789 | C: 0.62564
[LOGITS Ex2 A] Mean Abs: 2.178 | Max: 6.038
[LOSS Ex2] A: 0.12251 | B: 0.33402 | C: 0.24135
** [JOINT LOSS] ** : 0.862506
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004917 | Grad Max: 0.178054
  -> Layer: shared_layers.0.bias | Grad Mean: 0.219822 | Grad Max: 0.897702
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005854
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000348 | Grad Max: 0.000348
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001460 | Grad Max: 0.199337
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024761 | Grad Max: 1.061628
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.005404
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009038 | Grad Max: 0.053103
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001454 | Grad Max: 0.004242
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000360 | Grad Max: 0.001154
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000278 | Grad Max: 0.001024
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006434 | Grad Max: 0.006434
[GRADIENT NORM TOTAL] 4.5771

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.948
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50530034 0.49469963] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 715/1333 | B: 635/1413 | C: 502/1546
[LOSS Ex1] A: 0.63510 | B: 0.62351 | C: 0.62582
[LOGITS Ex2 A] Mean Abs: 2.136 | Max: 6.846
[LOSS Ex2] A: 0.13957 | B: 0.33385 | C: 0.23885
** [JOINT LOSS] ** : 0.865569
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005888 | Grad Max: 0.259829
  -> Layer: shared_layers.0.bias | Grad Mean: 0.174381 | Grad Max: 0.675488
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.006217
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006307 | Grad Max: 0.006307
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001432 | Grad Max: 0.155981
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023032 | Grad Max: 0.828638
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.004686
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005247 | Grad Max: 0.050639
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000165
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000684 | Grad Max: 0.002984
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000200 | Grad Max: 0.000850
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000177 | Grad Max: 0.000821
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004996 | Grad Max: 0.004996
[GRADIENT NORM TOTAL] 3.9123

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.784
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.501828 0.498172] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.072
[MASKS] A(Pass/Fail): 681/1367 | B: 592/1264 | C: 539/1509
[LOSS Ex1] A: 0.64357 | B: 0.62782 | C: 0.62068
[LOGITS Ex2 A] Mean Abs: 2.130 | Max: 6.576
[LOSS Ex2] A: 0.12014 | B: 0.32041 | C: 0.25677
** [JOINT LOSS] ** : 0.863130
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003888 | Grad Max: 0.113401
  -> Layer: shared_layers.0.bias | Grad Mean: 0.123345 | Grad Max: 0.521762
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005793
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002955 | Grad Max: 0.002955
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000975 | Grad Max: 0.292307
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016981 | Grad Max: 1.642169
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.003061
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003712 | Grad Max: 0.026231
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000179
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000787 | Grad Max: 0.003067
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000183 | Grad Max: 0.000840
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000257 | Grad Max: 0.000867
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002452 | Grad Max: 0.002452
[GRADIENT NORM TOTAL] 3.8777

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.704
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54393697 0.45606303] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.070
[MASKS] A(Pass/Fail): 683/1365 | B: 642/1406 | C: 526/1522
[LOSS Ex1] A: 0.64375 | B: 0.62705 | C: 0.62291
[LOGITS Ex2 A] Mean Abs: 2.122 | Max: 6.332
[LOSS Ex2] A: 0.12625 | B: 0.34263 | C: 0.25152
** [JOINT LOSS] ** : 0.871369
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.057211
  -> Layer: shared_layers.0.bias | Grad Mean: 0.115348 | Grad Max: 0.564649
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005507
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005440 | Grad Max: 0.005440
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000860 | Grad Max: 0.181933
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014978 | Grad Max: 1.017726
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.004595
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004542 | Grad Max: 0.037667
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000179
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000805 | Grad Max: 0.003744
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000171 | Grad Max: 0.000914
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000209 | Grad Max: 0.000624
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001138 | Grad Max: 0.001138
[GRADIENT NORM TOTAL] 3.0166

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.881
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7641752 0.2358248] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.074
[MASKS] A(Pass/Fail): 745/1303 | B: 631/1417 | C: 341/1035
[LOSS Ex1] A: 0.63767 | B: 0.62770 | C: 0.62957
[LOGITS Ex2 A] Mean Abs: 2.152 | Max: 6.342
[LOSS Ex2] A: 0.11494 | B: 0.34413 | C: 0.27534
** [JOINT LOSS] ** : 0.876448
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004239 | Grad Max: 0.123743
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100387 | Grad Max: 0.535738
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.006120
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010006 | Grad Max: 0.010006
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000926 | Grad Max: 0.136823
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015634 | Grad Max: 0.760092
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003419
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004232 | Grad Max: 0.026185
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000181
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000957 | Grad Max: 0.003331
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000255 | Grad Max: 0.000936
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000287 | Grad Max: 0.001055
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005638 | Grad Max: 0.005638
[GRADIENT NORM TOTAL] 2.7645

[EPOCH SUMMARY] Train Loss: 0.8633

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8453 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8463 -> New: 0.8453)

############################## EPOCH 130/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.978
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50072485 0.49927515] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.073
[MASKS] A(Pass/Fail): 716/1332 | B: 635/1413 | C: 537/1511
[LOSS Ex1] A: 0.64440 | B: 0.62331 | C: 0.62227
[LOGITS Ex2 A] Mean Abs: 2.158 | Max: 6.053
[LOSS Ex2] A: 0.10896 | B: 0.33039 | C: 0.23933
** [JOINT LOSS] ** : 0.856222
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003395 | Grad Max: 0.120065
  -> Layer: shared_layers.0.bias | Grad Mean: 0.071377 | Grad Max: 0.448864
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005741
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005791 | Grad Max: 0.005791
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000764 | Grad Max: 0.172814
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012093 | Grad Max: 0.970880
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.003039
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002993 | Grad Max: 0.027614
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000196
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000605 | Grad Max: 0.002633
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000147 | Grad Max: 0.000607
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000356 | Grad Max: 0.001075
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002982 | Grad Max: 0.002982
[GRADIENT NORM TOTAL] 2.4859

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.685
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70592654 0.29407352] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.074
[MASKS] A(Pass/Fail): 709/1339 | B: 593/1263 | C: 543/1505
[LOSS Ex1] A: 0.64012 | B: 0.62762 | C: 0.62050
[LOGITS Ex2 A] Mean Abs: 2.135 | Max: 6.004
[LOSS Ex2] A: 0.13431 | B: 0.31725 | C: 0.22679
** [JOINT LOSS] ** : 0.855529
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002886 | Grad Max: 0.077082
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134074 | Grad Max: 0.675476
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006513
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000110 | Grad Max: 0.000110
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000962 | Grad Max: 0.413684
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016403 | Grad Max: 2.310817
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002844
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002479 | Grad Max: 0.023660
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000135
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000408 | Grad Max: 0.002388
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000078
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000123 | Grad Max: 0.000738
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000390 | Grad Max: 0.001019
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001882 | Grad Max: 0.001882
[GRADIENT NORM TOTAL] 4.6541

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.793
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6251188  0.37488124] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 588/1028 | B: 643/1405 | C: 510/1538
[LOSS Ex1] A: 0.63835 | B: 0.62684 | C: 0.62632
[LOGITS Ex2 A] Mean Abs: 2.203 | Max: 9.788
[LOSS Ex2] A: 0.12182 | B: 0.34347 | C: 0.25285
** [JOINT LOSS] ** : 0.869886
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003486 | Grad Max: 0.104576
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143131 | Grad Max: 0.576087
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006128
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000117 | Grad Max: 0.000117
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001290 | Grad Max: 0.352313
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023192 | Grad Max: 1.994053
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.004731
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010591 | Grad Max: 0.055500
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000326
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002174 | Grad Max: 0.006324
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000123
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000510 | Grad Max: 0.001407
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000262 | Grad Max: 0.000994
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006779 | Grad Max: 0.006779
[GRADIENT NORM TOTAL] 4.3449

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.980
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50753254 0.49246752] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 714/1334 | B: 632/1416 | C: 529/1519
[LOSS Ex1] A: 0.63897 | B: 0.62748 | C: 0.62497
[LOGITS Ex2 A] Mean Abs: 2.159 | Max: 8.713
[LOSS Ex2] A: 0.10929 | B: 0.34811 | C: 0.26088
** [JOINT LOSS] ** : 0.869902
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003678 | Grad Max: 0.132777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.238907 | Grad Max: 1.305689
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005617
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000951 | Grad Max: 0.000951
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001506 | Grad Max: 0.436207
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026568 | Grad Max: 2.393966
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.006021
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008904 | Grad Max: 0.069545
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001604 | Grad Max: 0.004413
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000102
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000406 | Grad Max: 0.001453
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001245
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008063 | Grad Max: 0.008063
[GRADIENT NORM TOTAL] 5.8689

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.925
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50980896 0.49019104] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 708/1340 | B: 636/1412 | C: 536/1512
[LOSS Ex1] A: 0.63584 | B: 0.62310 | C: 0.62221
[LOGITS Ex2 A] Mean Abs: 2.131 | Max: 5.913
[LOSS Ex2] A: 0.12749 | B: 0.32674 | C: 0.23725
** [JOINT LOSS] ** : 0.857542
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004356 | Grad Max: 0.159600
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141358 | Grad Max: 0.524654
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.006593
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003587 | Grad Max: 0.003587
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001130 | Grad Max: 0.132348
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018185 | Grad Max: 0.743231
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.006322
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004238 | Grad Max: 0.047662
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000139
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000514 | Grad Max: 0.002998
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.000735
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000318 | Grad Max: 0.001001
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001002 | Grad Max: 0.001002
[GRADIENT NORM TOTAL] 3.1682

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.954
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5054078  0.49459222] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 715/1333 | B: 593/1263 | C: 516/1532
[LOSS Ex1] A: 0.63483 | B: 0.62740 | C: 0.62052
[LOGITS Ex2 A] Mean Abs: 2.137 | Max: 7.530
[LOSS Ex2] A: 0.13936 | B: 0.32259 | C: 0.24096
** [JOINT LOSS] ** : 0.861887
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005779 | Grad Max: 0.232786
  -> Layer: shared_layers.0.bias | Grad Mean: 0.202409 | Grad Max: 0.830745
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.006196
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001672 | Grad Max: 0.001672
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001482 | Grad Max: 0.308772
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025416 | Grad Max: 1.709780
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.006187
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010044 | Grad Max: 0.068996
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000248
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002248 | Grad Max: 0.005521
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000124
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000572 | Grad Max: 0.001645
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000442 | Grad Max: 0.001384
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010187 | Grad Max: 0.010187
[GRADIENT NORM TOTAL] 4.9830

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.789
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017989 0.4982011] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.073
[MASKS] A(Pass/Fail): 682/1366 | B: 643/1405 | C: 559/1489
[LOSS Ex1] A: 0.64331 | B: 0.62663 | C: 0.61858
[LOGITS Ex2 A] Mean Abs: 2.105 | Max: 5.906
[LOSS Ex2] A: 0.11491 | B: 0.34067 | C: 0.22505
** [JOINT LOSS] ** : 0.856381
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002772 | Grad Max: 0.094689
  -> Layer: shared_layers.0.bias | Grad Mean: 0.093005 | Grad Max: 0.371983
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006206
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008489 | Grad Max: 0.008489
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000761 | Grad Max: 0.189840
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012824 | Grad Max: 1.039045
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.003428
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002259 | Grad Max: 0.019764
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000150
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000442 | Grad Max: 0.003347
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000113 | Grad Max: 0.000784
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000455 | Grad Max: 0.001159
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001038 | Grad Max: 0.001038
[GRADIENT NORM TOTAL] 2.8074

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.709
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54404527 0.4559547 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.070
[MASKS] A(Pass/Fail): 683/1365 | B: 632/1416 | C: 520/1528
[LOSS Ex1] A: 0.64350 | B: 0.62726 | C: 0.62199
[LOGITS Ex2 A] Mean Abs: 2.088 | Max: 6.667
[LOSS Ex2] A: 0.12463 | B: 0.33597 | C: 0.23442
** [JOINT LOSS] ** : 0.862596
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004589 | Grad Max: 0.169253
  -> Layer: shared_layers.0.bias | Grad Mean: 0.119903 | Grad Max: 0.829351
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005471
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003216 | Grad Max: 0.003216
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001003 | Grad Max: 0.338523
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017179 | Grad Max: 1.903362
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.004458
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005651 | Grad Max: 0.038873
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000181
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001306 | Grad Max: 0.004165
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000108
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000329 | Grad Max: 0.001016
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001346
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006159 | Grad Max: 0.006159
[GRADIENT NORM TOTAL] 3.6282

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.888
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.76572686 0.2342731 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.074
[MASKS] A(Pass/Fail): 745/1303 | B: 636/1412 | C: 542/1506
[LOSS Ex1] A: 0.63740 | B: 0.62287 | C: 0.62041
[LOGITS Ex2 A] Mean Abs: 2.156 | Max: 6.798
[LOSS Ex2] A: 0.10866 | B: 0.32407 | C: 0.24640
** [JOINT LOSS] ** : 0.853272
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002941 | Grad Max: 0.120071
  -> Layer: shared_layers.0.bias | Grad Mean: 0.261785 | Grad Max: 1.572464
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002277 | Grad Max: 0.006240
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002723 | Grad Max: 0.002723
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001885 | Grad Max: 0.382930
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034298 | Grad Max: 2.133301
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000188 | Grad Max: 0.008681
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013839 | Grad Max: 0.095937
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000239
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002552 | Grad Max: 0.005712
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000116
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000635 | Grad Max: 0.001715
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000415 | Grad Max: 0.001550
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010148 | Grad Max: 0.010148
[GRADIENT NORM TOTAL] 6.4559

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.985
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007525  0.49924746] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 716/1332 | B: 594/1262 | C: 528/1520
[LOSS Ex1] A: 0.64414 | B: 0.62718 | C: 0.62085
[LOGITS Ex2 A] Mean Abs: 2.147 | Max: 6.543
[LOSS Ex2] A: 0.10511 | B: 0.32550 | C: 0.23383
** [JOINT LOSS] ** : 0.852205
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005105 | Grad Max: 0.216962
  -> Layer: shared_layers.0.bias | Grad Mean: 0.117907 | Grad Max: 0.410725
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005352
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003552 | Grad Max: 0.003552
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000979 | Grad Max: 0.217107
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015268 | Grad Max: 1.114668
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.003283
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002678 | Grad Max: 0.024314
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000184
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000493 | Grad Max: 0.002906
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000133 | Grad Max: 0.000732
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000442 | Grad Max: 0.001087
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001181 | Grad Max: 0.001181
[GRADIENT NORM TOTAL] 3.1372

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.691
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7069754  0.29302457] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 709/1339 | B: 643/1405 | C: 527/1521
[LOSS Ex1] A: 0.63985 | B: 0.62641 | C: 0.62378
[LOGITS Ex2 A] Mean Abs: 2.130 | Max: 5.833
[LOSS Ex2] A: 0.13155 | B: 0.34427 | C: 0.23998
** [JOINT LOSS] ** : 0.868611
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005086 | Grad Max: 0.146817
  -> Layer: shared_layers.0.bias | Grad Mean: 0.370512 | Grad Max: 1.976229
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.006333
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007205 | Grad Max: 0.007205
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.708533
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041581 | Grad Max: 3.948724
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000248 | Grad Max: 0.008499
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018312 | Grad Max: 0.104553
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000325
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003679 | Grad Max: 0.008088
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000912 | Grad Max: 0.002298
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000662 | Grad Max: 0.001742
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015748 | Grad Max: 0.015748
[GRADIENT NORM TOTAL] 8.9699

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.799
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6256767  0.37432334] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.074
[MASKS] A(Pass/Fail): 588/1028 | B: 632/1416 | C: 534/1514
[LOSS Ex1] A: 0.63808 | B: 0.62705 | C: 0.61909
[LOGITS Ex2 A] Mean Abs: 2.209 | Max: 7.999
[LOSS Ex2] A: 0.11859 | B: 0.33739 | C: 0.23940
** [JOINT LOSS] ** : 0.859863
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003507 | Grad Max: 0.135279
  -> Layer: shared_layers.0.bias | Grad Mean: 0.211251 | Grad Max: 0.934273
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006385
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008261 | Grad Max: 0.008261
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001143 | Grad Max: 0.585912
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019585 | Grad Max: 3.274876
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003755
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003484 | Grad Max: 0.035815
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000131
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000460 | Grad Max: 0.002661
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000600
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000302 | Grad Max: 0.000889
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001728 | Grad Max: 0.001728
[GRADIENT NORM TOTAL] 6.0334

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.987
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075193  0.49248073] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 714/1334 | B: 636/1412 | C: 565/1483
[LOSS Ex1] A: 0.63870 | B: 0.62266 | C: 0.61626
[LOGITS Ex2 A] Mean Abs: 2.201 | Max: 8.188
[LOSS Ex2] A: 0.11567 | B: 0.32072 | C: 0.24757
** [JOINT LOSS] ** : 0.853865
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009430 | Grad Max: 0.390026
  -> Layer: shared_layers.0.bias | Grad Mean: 0.406245 | Grad Max: 1.301687
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002326 | Grad Max: 0.005647
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000231 | Grad Max: 0.000231
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002881 | Grad Max: 0.405355
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051154 | Grad Max: 2.269688
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.011704
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025557 | Grad Max: 0.128815
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000484
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005467 | Grad Max: 0.011713
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000238
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001342 | Grad Max: 0.003126
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000907 | Grad Max: 0.002311
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021890 | Grad Max: 0.021890
[GRADIENT NORM TOTAL] 8.4429

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.931
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50978655 0.49021348] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 708/1340 | B: 594/1262 | C: 349/1027
[LOSS Ex1] A: 0.63555 | B: 0.62698 | C: 0.62340
[LOGITS Ex2 A] Mean Abs: 2.197 | Max: 6.716
[LOSS Ex2] A: 0.12434 | B: 0.32359 | C: 0.24706
** [JOINT LOSS] ** : 0.860309
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006341 | Grad Max: 0.260362
  -> Layer: shared_layers.0.bias | Grad Mean: 0.104399 | Grad Max: 0.364072
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002264 | Grad Max: 0.006650
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006796 | Grad Max: 0.006796
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001127 | Grad Max: 0.268936
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018441 | Grad Max: 1.390810
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.006487
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008124 | Grad Max: 0.057056
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000273
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001952 | Grad Max: 0.005241
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000105
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000524 | Grad Max: 0.001424
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000427 | Grad Max: 0.001349
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009576 | Grad Max: 0.009576
[GRADIENT NORM TOTAL] 3.0979

[EPOCH SUMMARY] Train Loss: 0.8599

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8481 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 131/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5054642 0.4945358] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 715/1333 | B: 644/1404 | C: 547/1501
[LOSS Ex1] A: 0.63454 | B: 0.62621 | C: 0.61786
[LOGITS Ex2 A] Mean Abs: 2.132 | Max: 6.579
[LOSS Ex2] A: 0.12677 | B: 0.36912 | C: 0.22496
** [JOINT LOSS] ** : 0.866490
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005200 | Grad Max: 0.228726
  -> Layer: shared_layers.0.bias | Grad Mean: 0.605777 | Grad Max: 3.078303
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002296 | Grad Max: 0.006558
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002464 | Grad Max: 0.002464
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003947 | Grad Max: 0.655362
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073270 | Grad Max: 3.653754
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000489 | Grad Max: 0.021675
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036749 | Grad Max: 0.220992
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000602
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007113 | Grad Max: 0.014594
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000265
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001764 | Grad Max: 0.004001
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001192 | Grad Max: 0.002254
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030424 | Grad Max: 0.030424
[GRADIENT NORM TOTAL] 13.6787

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.794
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50171345 0.49828658] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.073
[MASKS] A(Pass/Fail): 683/1365 | B: 633/1415 | C: 519/1529
[LOSS Ex1] A: 0.64304 | B: 0.62685 | C: 0.61898
[LOGITS Ex2 A] Mean Abs: 2.102 | Max: 6.423
[LOSS Ex2] A: 0.11743 | B: 0.36989 | C: 0.24620
** [JOINT LOSS] ** : 0.874128
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008806 | Grad Max: 0.209014
  -> Layer: shared_layers.0.bias | Grad Mean: 0.666289 | Grad Max: 2.630035
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005859
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006311 | Grad Max: 0.006311
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004402 | Grad Max: 0.546635
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.081938 | Grad Max: 3.068626
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000598 | Grad Max: 0.021288
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044514 | Grad Max: 0.249931
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000739
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009014 | Grad Max: 0.018322
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000380
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002256 | Grad Max: 0.005372
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001668 | Grad Max: 0.003164
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040131 | Grad Max: 0.040131
[GRADIENT NORM TOTAL] 13.8758

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.714
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54405946 0.45594054] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 683/1365 | B: 636/1412 | C: 519/1529
[LOSS Ex1] A: 0.64326 | B: 0.62246 | C: 0.62469
[LOGITS Ex2 A] Mean Abs: 2.129 | Max: 5.715
[LOSS Ex2] A: 0.12426 | B: 0.33117 | C: 0.25254
** [JOINT LOSS] ** : 0.866126
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006761 | Grad Max: 0.259019
  -> Layer: shared_layers.0.bias | Grad Mean: 0.205778 | Grad Max: 0.869356
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005602
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006855 | Grad Max: 0.006855
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001669 | Grad Max: 0.395064
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029535 | Grad Max: 2.219909
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000205 | Grad Max: 0.006514
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014543 | Grad Max: 0.075817
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000348
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003221 | Grad Max: 0.007582
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000172
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000807 | Grad Max: 0.002187
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000658 | Grad Max: 0.001661
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014131 | Grad Max: 0.014131
[GRADIENT NORM TOTAL] 5.0476

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.894
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.76713264 0.23286738] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 745/1303 | B: 594/1262 | C: 510/1538
[LOSS Ex1] A: 0.63714 | B: 0.62679 | C: 0.62201
[LOGITS Ex2 A] Mean Abs: 2.226 | Max: 6.699
[LOSS Ex2] A: 0.12704 | B: 0.33620 | C: 0.26113
** [JOINT LOSS] ** : 0.870100
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006006 | Grad Max: 0.308051
  -> Layer: shared_layers.0.bias | Grad Mean: 0.761881 | Grad Max: 3.888122
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.006292
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005207 | Grad Max: 0.005207
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004953 | Grad Max: 0.896494
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.092905 | Grad Max: 5.000305
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.021076
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.046134 | Grad Max: 0.272610
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000760
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009007 | Grad Max: 0.019214
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000324
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002232 | Grad Max: 0.005075
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001484 | Grad Max: 0.002941
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037505 | Grad Max: 0.037505
[GRADIENT NORM TOTAL] 17.5215

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.991
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007691  0.49923092] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.073
[MASKS] A(Pass/Fail): 717/1331 | B: 644/1404 | C: 545/1503
[LOSS Ex1] A: 0.64391 | B: 0.62603 | C: 0.61906
[LOGITS Ex2 A] Mean Abs: 2.248 | Max: 5.655
[LOSS Ex2] A: 0.11856 | B: 0.38451 | C: 0.21704
** [JOINT LOSS] ** : 0.869704
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007723 | Grad Max: 0.427495
  -> Layer: shared_layers.0.bias | Grad Mean: 1.094741 | Grad Max: 5.554728
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.005599
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004117 | Grad Max: 0.004117
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007040 | Grad Max: 1.120743
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.132081 | Grad Max: 6.234292
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000884 | Grad Max: 0.034544
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.066979 | Grad Max: 0.401497
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001110
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013130 | Grad Max: 0.026967
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000499
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003253 | Grad Max: 0.007453
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002164 | Grad Max: 0.003874
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.055047 | Grad Max: 0.055047
[GRADIENT NORM TOTAL] 24.6067

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.695
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70786875 0.29213125] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 710/1338 | B: 633/1415 | C: 535/1513
[LOSS Ex1] A: 0.63961 | B: 0.62668 | C: 0.62456
[LOGITS Ex2 A] Mean Abs: 2.229 | Max: 6.359
[LOSS Ex2] A: 0.13537 | B: 0.37197 | C: 0.29248
** [JOINT LOSS] ** : 0.896889
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006680 | Grad Max: 0.364169
  -> Layer: shared_layers.0.bias | Grad Mean: 0.976795 | Grad Max: 4.846680
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005380
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006067 | Grad Max: 0.006067
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006230 | Grad Max: 1.077711
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.116566 | Grad Max: 6.015776
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000776 | Grad Max: 0.031053
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.058911 | Grad Max: 0.349642
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.000983
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011438 | Grad Max: 0.023892
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000456
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002768 | Grad Max: 0.006921
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001792 | Grad Max: 0.003597
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045057 | Grad Max: 0.045057
[GRADIENT NORM TOTAL] 22.1342

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.804
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62613225 0.37386772] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 588/1028 | B: 636/1412 | C: 558/1490
[LOSS Ex1] A: 0.63785 | B: 0.62231 | C: 0.61823
[LOGITS Ex2 A] Mean Abs: 2.250 | Max: 6.558
[LOSS Ex2] A: 0.12172 | B: 0.31792 | C: 0.24101
** [JOINT LOSS] ** : 0.853014
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004760 | Grad Max: 0.167603
  -> Layer: shared_layers.0.bias | Grad Mean: 0.304874 | Grad Max: 1.498444
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.006018
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003973 | Grad Max: 0.003973
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.461089
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034482 | Grad Max: 2.591306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000179 | Grad Max: 0.007730
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012775 | Grad Max: 0.090520
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000238
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002283 | Grad Max: 0.005934
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000552 | Grad Max: 0.001543
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000325 | Grad Max: 0.001051
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008340 | Grad Max: 0.008340
[GRADIENT NORM TOTAL] 7.1397

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.993
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.507444   0.49255595] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 716/1332 | B: 594/1262 | C: 551/1497
[LOSS Ex1] A: 0.63848 | B: 0.62664 | C: 0.62190
[LOGITS Ex2 A] Mean Abs: 2.173 | Max: 7.412
[LOSS Ex2] A: 0.12079 | B: 0.35368 | C: 0.24047
** [JOINT LOSS] ** : 0.867321
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011709 | Grad Max: 0.343394
  -> Layer: shared_layers.0.bias | Grad Mean: 0.743006 | Grad Max: 3.109020
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.005945
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001950 | Grad Max: 0.001950
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005051 | Grad Max: 0.592270
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.092833 | Grad Max: 3.284600
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000682 | Grad Max: 0.025994
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.050605 | Grad Max: 0.295520
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000803
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010403 | Grad Max: 0.020617
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000446
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002587 | Grad Max: 0.006358
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001784 | Grad Max: 0.003245
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043961 | Grad Max: 0.043961
[GRADIENT NORM TOTAL] 15.2100

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.936
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5097895  0.49021047] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 709/1339 | B: 644/1404 | C: 530/1518
[LOSS Ex1] A: 0.63533 | B: 0.62590 | C: 0.62192
[LOGITS Ex2 A] Mean Abs: 2.163 | Max: 6.707
[LOSS Ex2] A: 0.12535 | B: 0.39678 | C: 0.24551
** [JOINT LOSS] ** : 0.883596
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009918 | Grad Max: 0.326858
  -> Layer: shared_layers.0.bias | Grad Mean: 1.010631 | Grad Max: 4.344743
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.006561
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001310 | Grad Max: 0.001310
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006642 | Grad Max: 1.052877
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.123838 | Grad Max: 5.850938
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000838 | Grad Max: 0.032651
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063493 | Grad Max: 0.368448
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001056
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012842 | Grad Max: 0.026088
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000513
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003201 | Grad Max: 0.007826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002181 | Grad Max: 0.004371
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.054662 | Grad Max: 0.054662
[GRADIENT NORM TOTAL] 22.0122

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.966
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.505483   0.49451706] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.074
[MASKS] A(Pass/Fail): 715/1333 | B: 633/1415 | C: 562/1486
[LOSS Ex1] A: 0.63433 | B: 0.62656 | C: 0.62178
[LOGITS Ex2 A] Mean Abs: 2.126 | Max: 8.149
[LOSS Ex2] A: 0.13212 | B: 0.37102 | C: 0.24388
** [JOINT LOSS] ** : 0.876558
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006164 | Grad Max: 0.271388
  -> Layer: shared_layers.0.bias | Grad Mean: 0.757032 | Grad Max: 3.488138
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.006398
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004747 | Grad Max: 0.004747
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004671 | Grad Max: 0.849158
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.087697 | Grad Max: 4.710185
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000603 | Grad Max: 0.024559
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.046038 | Grad Max: 0.282265
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000700
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009094 | Grad Max: 0.018066
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000355
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002289 | Grad Max: 0.005494
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001565 | Grad Max: 0.002992
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040048 | Grad Max: 0.040048
[GRADIENT NORM TOTAL] 16.2859

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.798
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016482  0.49835178] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.073
[MASKS] A(Pass/Fail): 684/1364 | B: 636/1412 | C: 583/1465
[LOSS Ex1] A: 0.64285 | B: 0.62219 | C: 0.61443
[LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.064
[LOSS Ex2] A: 0.11318 | B: 0.32249 | C: 0.23589
** [JOINT LOSS] ** : 0.850342
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005497 | Grad Max: 0.162857
  -> Layer: shared_layers.0.bias | Grad Mean: 0.217896 | Grad Max: 0.928172
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.006809
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012153 | Grad Max: 0.012153
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001612 | Grad Max: 0.515708
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029305 | Grad Max: 2.863945
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.006451
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012210 | Grad Max: 0.070088
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000291
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002628 | Grad Max: 0.006695
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000131
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001670
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000423 | Grad Max: 0.001427
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009928 | Grad Max: 0.009928
[GRADIENT NORM TOTAL] 5.7190

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.717
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439659 0.4560341] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 683/1365 | B: 595/1261 | C: 543/1505
[LOSS Ex1] A: 0.64308 | B: 0.62653 | C: 0.61821
[LOGITS Ex2 A] Mean Abs: 2.161 | Max: 5.585
[LOSS Ex2] A: 0.12486 | B: 0.32376 | C: 0.24129
** [JOINT LOSS] ** : 0.859243
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008373 | Grad Max: 0.232125
  -> Layer: shared_layers.0.bias | Grad Mean: 0.445471 | Grad Max: 1.822768
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005799
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004619 | Grad Max: 0.004619
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003019 | Grad Max: 0.413182
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055211 | Grad Max: 2.318281
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000369 | Grad Max: 0.014503
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027135 | Grad Max: 0.168504
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000515
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005668 | Grad Max: 0.012512
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000251
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001413 | Grad Max: 0.003416
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001022 | Grad Max: 0.002502
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024536 | Grad Max: 0.024536
[GRADIENT NORM TOTAL] 9.4138

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.897
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.76804686 0.23195307] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 745/1303 | B: 644/1404 | C: 561/1487
[LOSS Ex1] A: 0.63695 | B: 0.62580 | C: 0.62294
[LOGITS Ex2 A] Mean Abs: 2.180 | Max: 6.880
[LOSS Ex2] A: 0.11732 | B: 0.33878 | C: 0.26307
** [JOINT LOSS] ** : 0.868288
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007626 | Grad Max: 0.258203
  -> Layer: shared_layers.0.bias | Grad Mean: 0.259585 | Grad Max: 1.171643
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.006400
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005014 | Grad Max: 0.005014
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.341912
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035156 | Grad Max: 1.872692
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000266 | Grad Max: 0.010168
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018878 | Grad Max: 0.115104
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000379
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004061 | Grad Max: 0.008820
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000186
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001013 | Grad Max: 0.002544
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.001660
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016386 | Grad Max: 0.016386
[GRADIENT NORM TOTAL] 5.7719

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.995
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007297  0.49927035] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 717/1331 | B: 633/1415 | C: 348/1028
[LOSS Ex1] A: 0.64373 | B: 0.62645 | C: 0.62796
[LOGITS Ex2 A] Mean Abs: 2.129 | Max: 6.268
[LOSS Ex2] A: 0.10989 | B: 0.35500 | C: 0.25062
** [JOINT LOSS] ** : 0.871219
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005532 | Grad Max: 0.167078
  -> Layer: shared_layers.0.bias | Grad Mean: 0.492459 | Grad Max: 2.107852
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.005974
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010693 | Grad Max: 0.010693
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003216 | Grad Max: 0.444117
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059802 | Grad Max: 2.494663
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000403 | Grad Max: 0.014627
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030395 | Grad Max: 0.168913
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000535
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006147 | Grad Max: 0.012922
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001548 | Grad Max: 0.003520
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001100 | Grad Max: 0.002218
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027291 | Grad Max: 0.027291
[GRADIENT NORM TOTAL] 10.5824

[EPOCH SUMMARY] Train Loss: 0.8695

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8521 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 132/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.698
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70847976 0.29152027] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 710/1338 | B: 636/1412 | C: 536/1512
[LOSS Ex1] A: 0.63943 | B: 0.62208 | C: 0.62083
[LOGITS Ex2 A] Mean Abs: 2.095 | Max: 6.435
[LOSS Ex2] A: 0.13600 | B: 0.35074 | C: 0.24053
** [JOINT LOSS] ** : 0.869872
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009998 | Grad Max: 0.268569
  -> Layer: shared_layers.0.bias | Grad Mean: 0.678049 | Grad Max: 2.774681
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.006312
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002540 | Grad Max: 0.002540
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004383 | Grad Max: 0.585236
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.081876 | Grad Max: 3.172708
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000594 | Grad Max: 0.020704
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044458 | Grad Max: 0.251811
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000706
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008927 | Grad Max: 0.017692
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000376
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002227 | Grad Max: 0.005474
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001532 | Grad Max: 0.002888
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037863 | Grad Max: 0.037863
[GRADIENT NORM TOTAL] 13.7005

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.807
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62640864 0.37359133] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 588/1028 | B: 595/1261 | C: 537/1511
[LOSS Ex1] A: 0.63767 | B: 0.62643 | C: 0.62431
[LOGITS Ex2 A] Mean Abs: 2.149 | Max: 9.479
[LOSS Ex2] A: 0.12142 | B: 0.33094 | C: 0.24996
** [JOINT LOSS] ** : 0.863576
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006406 | Grad Max: 0.196214
  -> Layer: shared_layers.0.bias | Grad Mean: 0.301290 | Grad Max: 1.408984
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005539
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006383 | Grad Max: 0.006383
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.249072
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037911 | Grad Max: 1.275433
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.012288
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022525 | Grad Max: 0.137987
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000439
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004621 | Grad Max: 0.010255
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000204
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001163 | Grad Max: 0.002910
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000864 | Grad Max: 0.002148
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020155 | Grad Max: 0.020155
[GRADIENT NORM TOTAL] 6.1769

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.996
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50749743 0.49250254] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 716/1332 | B: 644/1404 | C: 526/1522
[LOSS Ex1] A: 0.63831 | B: 0.62570 | C: 0.62579
[LOGITS Ex2 A] Mean Abs: 2.199 | Max: 7.979
[LOSS Ex2] A: 0.11365 | B: 0.35558 | C: 0.27410
** [JOINT LOSS] ** : 0.877709
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005207 | Grad Max: 0.283034
  -> Layer: shared_layers.0.bias | Grad Mean: 0.768818 | Grad Max: 3.789115
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005893
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000079 | Grad Max: 0.000079
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004835 | Grad Max: 0.837824
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.090562 | Grad Max: 4.668731
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000617 | Grad Max: 0.022483
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047222 | Grad Max: 0.261635
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000788
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009352 | Grad Max: 0.018615
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000360
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002309 | Grad Max: 0.005497
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001423 | Grad Max: 0.002754
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037430 | Grad Max: 0.037430
[GRADIENT NORM TOTAL] 16.9620

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.940
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5096875  0.49031255] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 709/1339 | B: 633/1415 | C: 561/1487
[LOSS Ex1] A: 0.63517 | B: 0.62635 | C: 0.61806
[LOGITS Ex2 A] Mean Abs: 2.210 | Max: 6.627
[LOSS Ex2] A: 0.13309 | B: 0.39149 | C: 0.27979
** [JOINT LOSS] ** : 0.894647
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009642 | Grad Max: 0.414203
  -> Layer: shared_layers.0.bias | Grad Mean: 1.228036 | Grad Max: 5.336411
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006537
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003409 | Grad Max: 0.003409
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007656 | Grad Max: 1.341062
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.143661 | Grad Max: 7.487618
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000985 | Grad Max: 0.040052
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.075725 | Grad Max: 0.453847
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000121 | Grad Max: 0.001234
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.015211 | Grad Max: 0.031914
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000590
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003805 | Grad Max: 0.009365
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002510 | Grad Max: 0.005102
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064373 | Grad Max: 0.064373
[GRADIENT NORM TOTAL] 26.2670

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.969
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50556856 0.49443144] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 715/1333 | B: 636/1412 | C: 555/1493
[LOSS Ex1] A: 0.63417 | B: 0.62199 | C: 0.61971
[LOGITS Ex2 A] Mean Abs: 2.176 | Max: 7.467
[LOSS Ex2] A: 0.14177 | B: 0.35442 | C: 0.25262
** [JOINT LOSS] ** : 0.874895
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010185 | Grad Max: 0.355247
  -> Layer: shared_layers.0.bias | Grad Mean: 1.084433 | Grad Max: 4.566019
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.006414
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004761 | Grad Max: 0.004761
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006919 | Grad Max: 1.110409
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.129189 | Grad Max: 6.172784
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000904 | Grad Max: 0.035714
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.068784 | Grad Max: 0.414244
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001085
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013878 | Grad Max: 0.027908
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000508
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003498 | Grad Max: 0.007838
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002303 | Grad Max: 0.004216
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059194 | Grad Max: 0.059194
[GRADIENT NORM TOTAL] 22.9894

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.801
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50166225 0.4983377 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.073
[MASKS] A(Pass/Fail): 684/1364 | B: 596/1260 | C: 564/1484
[LOSS Ex1] A: 0.64270 | B: 0.62634 | C: 0.61523
[LOGITS Ex2 A] Mean Abs: 2.114 | Max: 6.653
[LOSS Ex2] A: 0.11290 | B: 0.32188 | C: 0.22701
** [JOINT LOSS] ** : 0.848684
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002835 | Grad Max: 0.092945
  -> Layer: shared_layers.0.bias | Grad Mean: 0.243710 | Grad Max: 1.111518
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005834
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000472 | Grad Max: 0.000472
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001775 | Grad Max: 0.438802
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032200 | Grad Max: 2.453110
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000185 | Grad Max: 0.008347
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014137 | Grad Max: 0.093400
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000250
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002751 | Grad Max: 0.006075
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000135
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000727 | Grad Max: 0.001865
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000509 | Grad Max: 0.001732
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012997 | Grad Max: 0.012997
[GRADIENT NORM TOTAL] 6.3057

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.720
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5440134 0.4559866] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 683/1365 | B: 646/1402 | C: 554/1494
[LOSS Ex1] A: 0.64294 | B: 0.62562 | C: 0.62132
[LOGITS Ex2 A] Mean Abs: 2.033 | Max: 6.360
[LOSS Ex2] A: 0.12643 | B: 0.36622 | C: 0.25792
** [JOINT LOSS] ** : 0.880148
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010411 | Grad Max: 0.267635
  -> Layer: shared_layers.0.bias | Grad Mean: 0.789341 | Grad Max: 3.667359
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.006595
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013742 | Grad Max: 0.013742
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005078 | Grad Max: 0.716664
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.094715 | Grad Max: 4.043537
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000639 | Grad Max: 0.021948
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048181 | Grad Max: 0.277578
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000806
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009878 | Grad Max: 0.018903
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000359
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002519 | Grad Max: 0.005779
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001771 | Grad Max: 0.003130
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044228 | Grad Max: 0.044228
[GRADIENT NORM TOTAL] 16.9970

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.900
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.76881343 0.23118658] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.075
[MASKS] A(Pass/Fail): 745/1303 | B: 633/1415 | C: 572/1476
[LOSS Ex1] A: 0.63681 | B: 0.62627 | C: 0.61591
[LOGITS Ex2 A] Mean Abs: 2.037 | Max: 6.377
[LOSS Ex2] A: 0.12577 | B: 0.39212 | C: 0.24864
** [JOINT LOSS] ** : 0.881843
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010212 | Grad Max: 0.344785
  -> Layer: shared_layers.0.bias | Grad Mean: 1.042530 | Grad Max: 4.641877
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006140
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002091 | Grad Max: 0.002091
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006557 | Grad Max: 0.802334
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.123613 | Grad Max: 4.535263
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000856 | Grad Max: 0.031929
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065524 | Grad Max: 0.366519
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000107 | Grad Max: 0.001057
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013255 | Grad Max: 0.027004
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000520
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003340 | Grad Max: 0.008082
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002207 | Grad Max: 0.004202
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.056527 | Grad Max: 0.056527
[GRADIENT NORM TOTAL] 22.2275

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.998
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50069433 0.49930567] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 717/1331 | B: 637/1411 | C: 595/1453
[LOSS Ex1] A: 0.64361 | B: 0.62191 | C: 0.61361
[LOGITS Ex2 A] Mean Abs: 2.060 | Max: 6.301
[LOSS Ex2] A: 0.10714 | B: 0.36859 | C: 0.25087
** [JOINT LOSS] ** : 0.868574
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008627 | Grad Max: 0.284707
  -> Layer: shared_layers.0.bias | Grad Mean: 0.867000 | Grad Max: 3.791410
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.005913
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005372 | Grad Max: 0.005372
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005457 | Grad Max: 0.973554
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102103 | Grad Max: 5.415605
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000692 | Grad Max: 0.028324
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.052829 | Grad Max: 0.332604
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000815
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010706 | Grad Max: 0.021393
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000409
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002693 | Grad Max: 0.006475
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001789 | Grad Max: 0.003396
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046071 | Grad Max: 0.046071
[GRADIENT NORM TOTAL] 18.7556

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.700
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7089249  0.29107514] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 711/1337 | B: 597/1259 | C: 518/1530
[LOSS Ex1] A: 0.63930 | B: 0.62626 | C: 0.62546
[LOGITS Ex2 A] Mean Abs: 2.082 | Max: 6.131
[LOSS Ex2] A: 0.12827 | B: 0.33235 | C: 0.25198
** [JOINT LOSS] ** : 0.867877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003138 | Grad Max: 0.125322
  -> Layer: shared_layers.0.bias | Grad Mean: 0.346150 | Grad Max: 1.662324
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005751
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000918 | Grad Max: 0.000918
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001977 | Grad Max: 0.564175
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036423 | Grad Max: 3.154107
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.008660
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015903 | Grad Max: 0.084697
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000323
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003105 | Grad Max: 0.007037
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000784 | Grad Max: 0.002449
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000514 | Grad Max: 0.001616
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013247 | Grad Max: 0.013247
[GRADIENT NORM TOTAL] 7.7391

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.810
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6265472  0.37345278] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 588/1028 | B: 646/1402 | C: 530/1518
[LOSS Ex1] A: 0.63755 | B: 0.62554 | C: 0.62238
[LOGITS Ex2 A] Mean Abs: 2.178 | Max: 8.044
[LOSS Ex2] A: 0.13266 | B: 0.35318 | C: 0.24817
** [JOINT LOSS] ** : 0.873161
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008757 | Grad Max: 0.232645
  -> Layer: shared_layers.0.bias | Grad Mean: 0.688699 | Grad Max: 2.995794
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.005636
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006419 | Grad Max: 0.006419
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004521 | Grad Max: 0.582163
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.084143 | Grad Max: 3.250406
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000596 | Grad Max: 0.019259
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045278 | Grad Max: 0.237821
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000759
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009219 | Grad Max: 0.018623
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000388
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002323 | Grad Max: 0.005856
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001484 | Grad Max: 0.003157
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038471 | Grad Max: 0.038471
[GRADIENT NORM TOTAL] 14.6503

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.999
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50749314 0.4925069 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 716/1332 | B: 633/1415 | C: 554/1494
[LOSS Ex1] A: 0.63820 | B: 0.62620 | C: 0.61710
[LOGITS Ex2 A] Mean Abs: 2.149 | Max: 6.922
[LOSS Ex2] A: 0.13242 | B: 0.36986 | C: 0.26065
** [JOINT LOSS] ** : 0.881473
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014401 | Grad Max: 0.389565
  -> Layer: shared_layers.0.bias | Grad Mean: 1.107231 | Grad Max: 4.852305
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.006027
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004771 | Grad Max: 0.004771
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007325 | Grad Max: 0.957052
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.136072 | Grad Max: 5.371665
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000942 | Grad Max: 0.033127
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.071209 | Grad Max: 0.390454
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000118 | Grad Max: 0.001129
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.014628 | Grad Max: 0.028681
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000533
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003726 | Grad Max: 0.008557
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002427 | Grad Max: 0.004576
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062607 | Grad Max: 0.062607
[GRADIENT NORM TOTAL] 23.7592

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.942
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50961703 0.49038297] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 709/1339 | B: 637/1411 | C: 529/1519
[LOSS Ex1] A: 0.63505 | B: 0.62184 | C: 0.62185
[LOGITS Ex2 A] Mean Abs: 2.160 | Max: 6.221
[LOSS Ex2] A: 0.13481 | B: 0.34087 | C: 0.23282
** [JOINT LOSS] ** : 0.862416
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011932 | Grad Max: 0.300555
  -> Layer: shared_layers.0.bias | Grad Mean: 0.863655 | Grad Max: 3.741196
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002323 | Grad Max: 0.006029
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003994 | Grad Max: 0.003994
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005653 | Grad Max: 0.806187
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.104908 | Grad Max: 4.514004
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000741 | Grad Max: 0.025928
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.056100 | Grad Max: 0.327325
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.000900
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011583 | Grad Max: 0.023211
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000451
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002976 | Grad Max: 0.006632
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001927 | Grad Max: 0.003914
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050166 | Grad Max: 0.050166
[GRADIENT NORM TOTAL] 18.4110

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.972
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50559366 0.49440628] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 715/1333 | B: 598/1258 | C: 354/1022
[LOSS Ex1] A: 0.63406 | B: 0.62619 | C: 0.62784
[LOGITS Ex2 A] Mean Abs: 2.080 | Max: 8.552
[LOSS Ex2] A: 0.13176 | B: 0.32188 | C: 0.24499
** [JOINT LOSS] ** : 0.862240
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003936 | Grad Max: 0.157110
  -> Layer: shared_layers.0.bias | Grad Mean: 0.105470 | Grad Max: 0.589201
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.007275
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000149 | Grad Max: 0.000149
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000953 | Grad Max: 0.244537
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016207 | Grad Max: 1.354697
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003558
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004512 | Grad Max: 0.033995
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001050 | Grad Max: 0.003545
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000258 | Grad Max: 0.000851
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.001199
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003823 | Grad Max: 0.003823
[GRADIENT NORM TOTAL] 3.2813

[EPOCH SUMMARY] Train Loss: 0.8719

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8570 | Alpha: 0.5500
No improve count: 3/15

############################## EPOCH 133/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.803
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016183  0.49838167] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.074
[MASKS] A(Pass/Fail): 684/1364 | B: 647/1401 | C: 566/1482
[LOSS Ex1] A: 0.64260 | B: 0.62547 | C: 0.62162
[LOGITS Ex2 A] Mean Abs: 2.008 | Max: 5.907
[LOSS Ex2] A: 0.12863 | B: 0.38318 | C: 0.23008
** [JOINT LOSS] ** : 0.877195
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008345 | Grad Max: 0.282962
  -> Layer: shared_layers.0.bias | Grad Mean: 0.833683 | Grad Max: 3.694321
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.005620
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004785 | Grad Max: 0.004785
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005614 | Grad Max: 0.698312
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.104315 | Grad Max: 3.886066
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000703 | Grad Max: 0.025239
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053791 | Grad Max: 0.306871
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.000818
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011080 | Grad Max: 0.021548
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000414
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002831 | Grad Max: 0.006762
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001836 | Grad Max: 0.003335
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048025 | Grad Max: 0.048025
[GRADIENT NORM TOTAL] 18.2798

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.722
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54404104 0.45595896] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 683/1365 | B: 633/1415 | C: 558/1490
[LOSS Ex1] A: 0.64284 | B: 0.62613 | C: 0.61663
[LOGITS Ex2 A] Mean Abs: 1.965 | Max: 7.018
[LOSS Ex2] A: 0.13634 | B: 0.39764 | C: 0.25553
** [JOINT LOSS] ** : 0.891705
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015609 | Grad Max: 0.392005
  -> Layer: shared_layers.0.bias | Grad Mean: 1.078942 | Grad Max: 4.337672
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.005770
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006575 | Grad Max: 0.006575
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007188 | Grad Max: 0.848087
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.133554 | Grad Max: 4.638885
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000951 | Grad Max: 0.031957
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.072255 | Grad Max: 0.382839
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000122 | Grad Max: 0.001127
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.014987 | Grad Max: 0.029205
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000574
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003832 | Grad Max: 0.008975
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002613 | Grad Max: 0.004995
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.065877 | Grad Max: 0.065877
[GRADIENT NORM TOTAL] 22.3570

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.903
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.76943624 0.23056376] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.075
[MASKS] A(Pass/Fail): 745/1303 | B: 637/1411 | C: 548/1500
[LOSS Ex1] A: 0.63670 | B: 0.62176 | C: 0.61839
[LOGITS Ex2 A] Mean Abs: 2.032 | Max: 6.268
[LOSS Ex2] A: 0.12496 | B: 0.34642 | C: 0.22619
** [JOINT LOSS] ** : 0.858139
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011047 | Grad Max: 0.282140
  -> Layer: shared_layers.0.bias | Grad Mean: 0.729472 | Grad Max: 2.867814
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.005889
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001469 | Grad Max: 0.001469
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004793 | Grad Max: 0.571273
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088359 | Grad Max: 3.185843
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000626 | Grad Max: 0.022206
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047327 | Grad Max: 0.244631
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000770
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009910 | Grad Max: 0.019978
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000424
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002481 | Grad Max: 0.006558
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.003431
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040684 | Grad Max: 0.040684
[GRADIENT NORM TOTAL] 15.0714

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 1.001
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500706 0.499294] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 717/1331 | B: 598/1258 | C: 573/1475
[LOSS Ex1] A: 0.64350 | B: 0.62611 | C: 0.61977
[LOGITS Ex2 A] Mean Abs: 2.090 | Max: 5.961
[LOSS Ex2] A: 0.10876 | B: 0.31711 | C: 0.24882
** [JOINT LOSS] ** : 0.854690
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.184434
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139690 | Grad Max: 0.751308
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005628
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001745 | Grad Max: 0.001745
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001121 | Grad Max: 0.268528
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017886 | Grad Max: 1.497478
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003333
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002727 | Grad Max: 0.024796
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000141
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000320 | Grad Max: 0.002145
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000092 | Grad Max: 0.000565
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000450 | Grad Max: 0.001180
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000184 | Grad Max: 0.000184
[GRADIENT NORM TOTAL] 3.8267

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.703
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7093755 0.2906245] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 711/1337 | B: 647/1401 | C: 537/1511
[LOSS Ex1] A: 0.63919 | B: 0.62539 | C: 0.62333
[LOGITS Ex2 A] Mean Abs: 2.117 | Max: 5.657
[LOSS Ex2] A: 0.12988 | B: 0.35478 | C: 0.26834
** [JOINT LOSS] ** : 0.880304
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005836 | Grad Max: 0.256267
  -> Layer: shared_layers.0.bias | Grad Mean: 0.710384 | Grad Max: 3.354811
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006327
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000842 | Grad Max: 0.000842
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004423 | Grad Max: 0.786702
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.081917 | Grad Max: 4.398643
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000563 | Grad Max: 0.022365
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.043102 | Grad Max: 0.264595
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000728
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008545 | Grad Max: 0.016842
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000357
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002192 | Grad Max: 0.005750
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001354 | Grad Max: 0.002899
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036675 | Grad Max: 0.036675
[GRADIENT NORM TOTAL] 15.2047

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.812
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62681 0.37319] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 588/1028 | B: 633/1415 | C: 564/1484
[LOSS Ex1] A: 0.63743 | B: 0.62604 | C: 0.61836
[LOGITS Ex2 A] Mean Abs: 2.155 | Max: 9.296
[LOSS Ex2] A: 0.11506 | B: 0.36004 | C: 0.27416
** [JOINT LOSS] ** : 0.877027
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006885 | Grad Max: 0.237537
  -> Layer: shared_layers.0.bias | Grad Mean: 0.753272 | Grad Max: 3.075963
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.006278
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010051 | Grad Max: 0.010051
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004676 | Grad Max: 0.645649
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.087901 | Grad Max: 3.590152
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000615 | Grad Max: 0.024671
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047385 | Grad Max: 0.294335
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000738
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009510 | Grad Max: 0.018615
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000389
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002403 | Grad Max: 0.006106
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001491 | Grad Max: 0.002918
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038808 | Grad Max: 0.038808
[GRADIENT NORM TOTAL] 15.6435

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.002
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50749063 0.49250937] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 717/1331 | B: 637/1411 | C: 527/1521
[LOSS Ex1] A: 0.63807 | B: 0.62167 | C: 0.62223
[LOGITS Ex2 A] Mean Abs: 2.111 | Max: 6.797
[LOSS Ex2] A: 0.11512 | B: 0.32270 | C: 0.24144
** [JOINT LOSS] ** : 0.853745
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004505 | Grad Max: 0.109322
  -> Layer: shared_layers.0.bias | Grad Mean: 0.358862 | Grad Max: 1.267490
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.006353
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013261 | Grad Max: 0.013261
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002466 | Grad Max: 0.356395
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045577 | Grad Max: 1.995037
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000324 | Grad Max: 0.011929
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024596 | Grad Max: 0.130720
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000366
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005113 | Grad Max: 0.010350
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000218
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001345 | Grad Max: 0.002919
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000907 | Grad Max: 0.002687
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024142 | Grad Max: 0.024142
[GRADIENT NORM TOTAL] 7.6463

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50963676 0.49036324] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 709/1339 | B: 598/1258 | C: 542/1506
[LOSS Ex1] A: 0.63492 | B: 0.62602 | C: 0.61958
[LOGITS Ex2 A] Mean Abs: 2.045 | Max: 5.505
[LOSS Ex2] A: 0.11670 | B: 0.34412 | C: 0.22981
** [JOINT LOSS] ** : 0.857047
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004186 | Grad Max: 0.213070
  -> Layer: shared_layers.0.bias | Grad Mean: 0.623315 | Grad Max: 2.835513
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002253 | Grad Max: 0.006086
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003153 | Grad Max: 0.003153
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003904 | Grad Max: 0.672458
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073197 | Grad Max: 3.752946
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000490 | Grad Max: 0.018794
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.038102 | Grad Max: 0.217045
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000620
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007646 | Grad Max: 0.015828
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000319
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001932 | Grad Max: 0.004885
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001190 | Grad Max: 0.002733
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031800 | Grad Max: 0.031800
[GRADIENT NORM TOTAL] 13.5155

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.974
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056137  0.49438632] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 716/1332 | B: 647/1401 | C: 541/1507
[LOSS Ex1] A: 0.63392 | B: 0.62530 | C: 0.62061
[LOGITS Ex2 A] Mean Abs: 2.019 | Max: 7.388
[LOSS Ex2] A: 0.13409 | B: 0.39254 | C: 0.22166
** [JOINT LOSS] ** : 0.876041
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007403 | Grad Max: 0.313602
  -> Layer: shared_layers.0.bias | Grad Mean: 0.955535 | Grad Max: 4.262785
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002260 | Grad Max: 0.006654
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002491 | Grad Max: 0.002491
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006153 | Grad Max: 0.986255
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.115337 | Grad Max: 5.502318
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000776 | Grad Max: 0.029657
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059945 | Grad Max: 0.342330
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000911
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011970 | Grad Max: 0.023350
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000469
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003051 | Grad Max: 0.007302
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001901 | Grad Max: 0.003552
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050517 | Grad Max: 0.050517
[GRADIENT NORM TOTAL] 20.7859

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.805
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015696 0.4984303] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.074
[MASKS] A(Pass/Fail): 684/1364 | B: 633/1415 | C: 516/1532
[LOSS Ex1] A: 0.64248 | B: 0.62595 | C: 0.62505
[LOGITS Ex2 A] Mean Abs: 1.999 | Max: 6.158
[LOSS Ex2] A: 0.11269 | B: 0.38451 | C: 0.23100
** [JOINT LOSS] ** : 0.873895
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006310 | Grad Max: 0.283243
  -> Layer: shared_layers.0.bias | Grad Mean: 0.849842 | Grad Max: 3.645814
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.006227
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011994 | Grad Max: 0.011994
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005352 | Grad Max: 0.974500
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.100645 | Grad Max: 5.428516
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000696 | Grad Max: 0.027667
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053754 | Grad Max: 0.326982
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000823
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010848 | Grad Max: 0.022049
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000439
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002795 | Grad Max: 0.006959
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001829 | Grad Max: 0.003624
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047806 | Grad Max: 0.047806
[GRADIENT NORM TOTAL] 18.2197

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.724
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439764  0.45602354] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 683/1365 | B: 637/1411 | C: 554/1494
[LOSS Ex1] A: 0.64273 | B: 0.62159 | C: 0.62137
[LOGITS Ex2 A] Mean Abs: 2.028 | Max: 5.878
[LOSS Ex2] A: 0.12398 | B: 0.32779 | C: 0.25041
** [JOINT LOSS] ** : 0.862620
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004076 | Grad Max: 0.116352
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215826 | Grad Max: 0.947965
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.006562
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011294 | Grad Max: 0.011294
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001428 | Grad Max: 0.598486
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024819 | Grad Max: 3.332857
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.006958
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009805 | Grad Max: 0.077276
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000173
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001774 | Grad Max: 0.004676
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000095
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000461 | Grad Max: 0.001257
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000394 | Grad Max: 0.001373
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008702 | Grad Max: 0.008702
[GRADIENT NORM TOTAL] 5.7023

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.905
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7700944  0.22990565] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.075
[MASKS] A(Pass/Fail): 745/1303 | B: 598/1258 | C: 545/1503
[LOSS Ex1] A: 0.63658 | B: 0.62594 | C: 0.61935
[LOGITS Ex2 A] Mean Abs: 2.120 | Max: 5.739
[LOSS Ex2] A: 0.13017 | B: 0.32662 | C: 0.25213
** [JOINT LOSS] ** : 0.863594
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.013396 | Grad Max: 0.381280
  -> Layer: shared_layers.0.bias | Grad Mean: 0.737213 | Grad Max: 3.098425
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.006511
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007008 | Grad Max: 0.007008
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005039 | Grad Max: 0.699982
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.092647 | Grad Max: 3.916414
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000637 | Grad Max: 0.020601
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047952 | Grad Max: 0.239413
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000807
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010144 | Grad Max: 0.019817
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000392
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002624 | Grad Max: 0.005902
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001759 | Grad Max: 0.003721
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043908 | Grad Max: 0.043908
[GRADIENT NORM TOTAL] 15.6191

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 1.003
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006981  0.49930185] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 717/1331 | B: 647/1401 | C: 564/1484
[LOSS Ex1] A: 0.64339 | B: 0.62523 | C: 0.61734
[LOGITS Ex2 A] Mean Abs: 2.154 | Max: 5.860
[LOSS Ex2] A: 0.13730 | B: 0.37853 | C: 0.27545
** [JOINT LOSS] ** : 0.892410
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.013400 | Grad Max: 0.357914
  -> Layer: shared_layers.0.bias | Grad Mean: 1.034173 | Grad Max: 4.694898
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.006086
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004129 | Grad Max: 0.004129
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006912 | Grad Max: 1.006483
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.128723 | Grad Max: 5.599765
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000880 | Grad Max: 0.032282
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.067188 | Grad Max: 0.378535
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001092
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013854 | Grad Max: 0.028183
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000561
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003562 | Grad Max: 0.008317
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002352 | Grad Max: 0.004332
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059520 | Grad Max: 0.059520
[GRADIENT NORM TOTAL] 22.7050

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.705
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.70975107 0.29024896] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 711/1337 | B: 633/1415 | C: 391/985
[LOSS Ex1] A: 0.63907 | B: 0.62588 | C: 0.61879
[LOGITS Ex2 A] Mean Abs: 2.103 | Max: 5.478
[LOSS Ex2] A: 0.14284 | B: 0.35883 | C: 0.27756
** [JOINT LOSS] ** : 0.887652
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007715 | Grad Max: 0.323765
  -> Layer: shared_layers.0.bias | Grad Mean: 0.843548 | Grad Max: 4.221925
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005986
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001744 | Grad Max: 0.001744
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005527 | Grad Max: 0.949914
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.103360 | Grad Max: 5.261958
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000689 | Grad Max: 0.024535
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053301 | Grad Max: 0.317356
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000905
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010860 | Grad Max: 0.021167
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000461
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002776 | Grad Max: 0.006751
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001776 | Grad Max: 0.003343
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045857 | Grad Max: 0.045857
[GRADIENT NORM TOTAL] 19.1604

[EPOCH SUMMARY] Train Loss: 0.8719

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8421 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8453 -> New: 0.8421)

############################## EPOCH 134/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.814
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6270077  0.37299225] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 588/1028 | B: 637/1411 | C: 548/1500
[LOSS Ex1] A: 0.63731 | B: 0.62151 | C: 0.61471
[LOGITS Ex2 A] Mean Abs: 2.100 | Max: 8.837
[LOSS Ex2] A: 0.11983 | B: 0.31764 | C: 0.23900
** [JOINT LOSS] ** : 0.850004
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003151 | Grad Max: 0.080498
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153064 | Grad Max: 0.830153
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002312 | Grad Max: 0.006678
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008902 | Grad Max: 0.008902
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001050 | Grad Max: 0.299800
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018229 | Grad Max: 1.658061
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000071 | Grad Max: 0.003648
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004786 | Grad Max: 0.035974
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000187
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000843 | Grad Max: 0.003264
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000221 | Grad Max: 0.000879
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000426 | Grad Max: 0.001066
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002690 | Grad Max: 0.002690
[GRADIENT NORM TOTAL] 4.0925

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.004
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074773 0.4925227] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 717/1331 | B: 598/1258 | C: 564/1484
[LOSS Ex1] A: 0.63796 | B: 0.62586 | C: 0.62007
[LOGITS Ex2 A] Mean Abs: 2.039 | Max: 6.986
[LOSS Ex2] A: 0.12119 | B: 0.34718 | C: 0.23190
** [JOINT LOSS] ** : 0.861389
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010179 | Grad Max: 0.267901
  -> Layer: shared_layers.0.bias | Grad Mean: 0.715793 | Grad Max: 2.847285
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006240
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003901 | Grad Max: 0.003901
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004516 | Grad Max: 0.516004
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083813 | Grad Max: 2.780766
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000607 | Grad Max: 0.019671
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.046347 | Grad Max: 0.257326
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000782
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009613 | Grad Max: 0.019036
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000402
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002457 | Grad Max: 0.005801
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001625 | Grad Max: 0.003294
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040425 | Grad Max: 0.040425
[GRADIENT NORM TOTAL] 14.1366

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.947
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.509579   0.49042097] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 710/1338 | B: 647/1401 | C: 561/1487
[LOSS Ex1] A: 0.63480 | B: 0.62516 | C: 0.61633
[LOGITS Ex2 A] Mean Abs: 2.004 | Max: 6.359
[LOSS Ex2] A: 0.13100 | B: 0.39022 | C: 0.22606
** [JOINT LOSS] ** : 0.874522
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010755 | Grad Max: 0.285420
  -> Layer: shared_layers.0.bias | Grad Mean: 0.904914 | Grad Max: 3.833302
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002325 | Grad Max: 0.006724
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005306 | Grad Max: 0.005306
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005775 | Grad Max: 0.739996
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107448 | Grad Max: 4.117234
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000736 | Grad Max: 0.028349
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.056546 | Grad Max: 0.323814
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.000872
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011683 | Grad Max: 0.022946
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000447
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002995 | Grad Max: 0.007005
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001972 | Grad Max: 0.003447
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050114 | Grad Max: 0.050114
[GRADIENT NORM TOTAL] 18.9026

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.977
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056912  0.49430883] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 716/1332 | B: 633/1415 | C: 544/1504
[LOSS Ex1] A: 0.63380 | B: 0.62580 | C: 0.62197
[LOGITS Ex2 A] Mean Abs: 2.032 | Max: 7.480
[LOSS Ex2] A: 0.13494 | B: 0.36431 | C: 0.27927
** [JOINT LOSS] ** : 0.886699
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008372 | Grad Max: 0.226738
  -> Layer: shared_layers.0.bias | Grad Mean: 0.613035 | Grad Max: 2.683043
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.006540
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001726 | Grad Max: 0.001726
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004076 | Grad Max: 0.556365
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.076006 | Grad Max: 3.105547
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000529 | Grad Max: 0.022490
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040672 | Grad Max: 0.244422
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000674
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008401 | Grad Max: 0.017050
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000338
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002212 | Grad Max: 0.005445
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001514 | Grad Max: 0.002928
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038650 | Grad Max: 0.038650
[GRADIENT NORM TOTAL] 13.0846

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.807
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016075 0.4983925] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.556 | Std: 0.074
[MASKS] A(Pass/Fail): 684/1364 | B: 637/1411 | C: 522/1526
[LOSS Ex1] A: 0.64236 | B: 0.62144 | C: 0.62010
[LOGITS Ex2 A] Mean Abs: 2.038 | Max: 5.308
[LOSS Ex2] A: 0.11935 | B: 0.31742 | C: 0.22558
** [JOINT LOSS] ** : 0.848753
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002379 | Grad Max: 0.086341
  -> Layer: shared_layers.0.bias | Grad Mean: 0.214661 | Grad Max: 1.133284
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.005686
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005579 | Grad Max: 0.005579
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001383 | Grad Max: 0.260317
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025310 | Grad Max: 1.464876
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.008111
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011301 | Grad Max: 0.091448
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000248
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002238 | Grad Max: 0.005522
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000575 | Grad Max: 0.001490
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000560 | Grad Max: 0.001694
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009289 | Grad Max: 0.009289
[GRADIENT NORM TOTAL] 4.7845

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.726
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439173  0.45608273] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.555 | Std: 0.071
[MASKS] A(Pass/Fail): 683/1365 | B: 598/1258 | C: 553/1495
[LOSS Ex1] A: 0.64261 | B: 0.62579 | C: 0.62121
[LOGITS Ex2 A] Mean Abs: 2.051 | Max: 5.978
[LOSS Ex2] A: 0.13198 | B: 0.31838 | C: 0.25612
** [JOINT LOSS] ** : 0.865362
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005790 | Grad Max: 0.179241
  -> Layer: shared_layers.0.bias | Grad Mean: 0.494274 | Grad Max: 2.299472
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005628
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007872 | Grad Max: 0.007872
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003154 | Grad Max: 0.426194
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058426 | Grad Max: 2.382436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000402 | Grad Max: 0.015477
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030770 | Grad Max: 0.186290
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000573
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006344 | Grad Max: 0.014038
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000258
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001646 | Grad Max: 0.003977
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001050 | Grad Max: 0.002562
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027325 | Grad Max: 0.027325
[GRADIENT NORM TOTAL] 10.4173

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.908
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.77066547 0.22933453] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.075
[MASKS] A(Pass/Fail): 745/1303 | B: 647/1401 | C: 552/1496
[LOSS Ex1] A: 0.63646 | B: 0.62508 | C: 0.62270
[LOGITS Ex2 A] Mean Abs: 2.090 | Max: 6.363
[LOSS Ex2] A: 0.11286 | B: 0.33424 | C: 0.24102
** [JOINT LOSS] ** : 0.857453
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006433 | Grad Max: 0.174484
  -> Layer: shared_layers.0.bias | Grad Mean: 0.325216 | Grad Max: 1.102158
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.006279
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006588 | Grad Max: 0.006588
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.281465
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040695 | Grad Max: 1.553119
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.008442
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022228 | Grad Max: 0.107078
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000414
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004690 | Grad Max: 0.009669
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000213
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001212 | Grad Max: 0.003179
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000802 | Grad Max: 0.002233
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020088 | Grad Max: 0.020088
[GRADIENT NORM TOTAL] 6.6148

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 1.007
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006424  0.49935755] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 717/1331 | B: 633/1415 | C: 552/1496
[LOSS Ex1] A: 0.64327 | B: 0.62573 | C: 0.62231
[LOGITS Ex2 A] Mean Abs: 2.028 | Max: 5.836
[LOSS Ex2] A: 0.10524 | B: 0.35045 | C: 0.24056
** [JOINT LOSS] ** : 0.862519
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004484 | Grad Max: 0.150804
  -> Layer: shared_layers.0.bias | Grad Mean: 0.482897 | Grad Max: 2.131352
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005781
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000303 | Grad Max: 0.000303
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002969 | Grad Max: 0.533972
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055770 | Grad Max: 3.010740
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000369 | Grad Max: 0.013644
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028619 | Grad Max: 0.163690
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000436
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005797 | Grad Max: 0.011423
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001476 | Grad Max: 0.003599
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000963 | Grad Max: 0.002375
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024575 | Grad Max: 0.024575
[GRADIENT NORM TOTAL] 10.2885

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.707
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7101686  0.28983137] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 711/1337 | B: 637/1411 | C: 562/1486
[LOSS Ex1] A: 0.63894 | B: 0.62135 | C: 0.61950
[LOGITS Ex2 A] Mean Abs: 2.009 | Max: 6.018
[LOSS Ex2] A: 0.13680 | B: 0.33866 | C: 0.25709
** [JOINT LOSS] ** : 0.870779
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007853 | Grad Max: 0.198707
  -> Layer: shared_layers.0.bias | Grad Mean: 0.604626 | Grad Max: 2.558968
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.005880
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000289 | Grad Max: 0.000289
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003838 | Grad Max: 0.599604
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.071500 | Grad Max: 3.367372
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000468 | Grad Max: 0.015353
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035807 | Grad Max: 0.193759
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000582
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007402 | Grad Max: 0.014274
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000312
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001938 | Grad Max: 0.004657
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001307 | Grad Max: 0.002551
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033278 | Grad Max: 0.033278
[GRADIENT NORM TOTAL] 12.7597

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.816
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6271701  0.37282997] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.075
[MASKS] A(Pass/Fail): 588/1028 | B: 598/1258 | C: 576/1472
[LOSS Ex1] A: 0.63718 | B: 0.62571 | C: 0.61850
[LOGITS Ex2 A] Mean Abs: 2.090 | Max: 8.241
[LOSS Ex2] A: 0.11312 | B: 0.32599 | C: 0.23915
** [JOINT LOSS] ** : 0.853216
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005020 | Grad Max: 0.135628
  -> Layer: shared_layers.0.bias | Grad Mean: 0.402868 | Grad Max: 1.706349
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.006866
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013702 | Grad Max: 0.013702
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002460 | Grad Max: 0.365297
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045919 | Grad Max: 2.038866
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000339 | Grad Max: 0.014651
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025832 | Grad Max: 0.155403
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000413
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005222 | Grad Max: 0.010592
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001343 | Grad Max: 0.003485
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000907 | Grad Max: 0.002377
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022743 | Grad Max: 0.022743
[GRADIENT NORM TOTAL] 8.0088

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.008
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075223  0.49247777] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 717/1331 | B: 647/1401 | C: 556/1492
[LOSS Ex1] A: 0.63783 | B: 0.62500 | C: 0.62449
[LOGITS Ex2 A] Mean Abs: 2.089 | Max: 7.482
[LOSS Ex2] A: 0.11754 | B: 0.35272 | C: 0.25241
** [JOINT LOSS] ** : 0.869997
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006533 | Grad Max: 0.195265
  -> Layer: shared_layers.0.bias | Grad Mean: 0.534950 | Grad Max: 2.498626
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006069
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003679 | Grad Max: 0.003679
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003626 | Grad Max: 0.601007
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067463 | Grad Max: 3.348502
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.015444
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034076 | Grad Max: 0.186182
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000563
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007048 | Grad Max: 0.014223
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000269
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001833 | Grad Max: 0.004415
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001169 | Grad Max: 0.002675
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030277 | Grad Max: 0.030277
[GRADIENT NORM TOTAL] 12.1803

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.950
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5094571 0.4905429] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 710/1338 | B: 633/1415 | C: 530/1518
[LOSS Ex1] A: 0.63467 | B: 0.62564 | C: 0.62065
[LOGITS Ex2 A] Mean Abs: 2.118 | Max: 6.159
[LOSS Ex2] A: 0.13064 | B: 0.36000 | C: 0.25676
** [JOINT LOSS] ** : 0.876123
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008920 | Grad Max: 0.317341
  -> Layer: shared_layers.0.bias | Grad Mean: 0.836068 | Grad Max: 4.156336
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.006202
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002807 | Grad Max: 0.002807
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005413 | Grad Max: 0.890409
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.101583 | Grad Max: 4.945156
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000673 | Grad Max: 0.023896
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.052314 | Grad Max: 0.288752
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000870
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010696 | Grad Max: 0.021345
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000441
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002772 | Grad Max: 0.006724
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001796 | Grad Max: 0.003634
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046030 | Grad Max: 0.046030
[GRADIENT NORM TOTAL] 18.5532

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.980
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50574607 0.49425387] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 716/1332 | B: 637/1411 | C: 551/1497
[LOSS Ex1] A: 0.63367 | B: 0.62127 | C: 0.61626
[LOGITS Ex2 A] Mean Abs: 2.088 | Max: 5.975
[LOSS Ex2] A: 0.13592 | B: 0.32244 | C: 0.24655
** [JOINT LOSS] ** : 0.858706
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006851 | Grad Max: 0.223153
  -> Layer: shared_layers.0.bias | Grad Mean: 0.606314 | Grad Max: 2.878774
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002380 | Grad Max: 0.006449
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005286 | Grad Max: 0.005286
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004003 | Grad Max: 0.609850
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074348 | Grad Max: 3.424419
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000504 | Grad Max: 0.017349
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.038924 | Grad Max: 0.220809
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000607
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008100 | Grad Max: 0.015525
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000348
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002118 | Grad Max: 0.005407
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001405 | Grad Max: 0.003183
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035809 | Grad Max: 0.035809
[GRADIENT NORM TOTAL] 13.4180

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.809
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016026  0.49839744] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.074
[MASKS] A(Pass/Fail): 684/1364 | B: 600/1256 | C: 362/1014
[LOSS Ex1] A: 0.64223 | B: 0.62563 | C: 0.62020
[LOGITS Ex2 A] Mean Abs: 2.005 | Max: 5.937
[LOSS Ex2] A: 0.11687 | B: 0.32271 | C: 0.21754
** [JOINT LOSS] ** : 0.848395
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004442 | Grad Max: 0.136458
  -> Layer: shared_layers.0.bias | Grad Mean: 0.221505 | Grad Max: 0.887782
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.006355
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009800 | Grad Max: 0.009800
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001383 | Grad Max: 0.155693
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024874 | Grad Max: 0.796285
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006839
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013746 | Grad Max: 0.058904
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002939 | Grad Max: 0.007633
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000161
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000721 | Grad Max: 0.002269
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000530 | Grad Max: 0.001859
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010746 | Grad Max: 0.010746
[GRADIENT NORM TOTAL] 4.1589

[EPOCH SUMMARY] Train Loss: 0.8631

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8427 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 135/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.728
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438818  0.45611823] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.072
[MASKS] A(Pass/Fail): 683/1365 | B: 647/1401 | C: 563/1485
[LOSS Ex1] A: 0.64248 | B: 0.62492 | C: 0.61967
[LOGITS Ex2 A] Mean Abs: 1.978 | Max: 6.023
[LOSS Ex2] A: 0.12490 | B: 0.34802 | C: 0.23917
** [JOINT LOSS] ** : 0.866391
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003022 | Grad Max: 0.091965
  -> Layer: shared_layers.0.bias | Grad Mean: 0.287283 | Grad Max: 1.274730
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005988
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005401 | Grad Max: 0.005401
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.600188
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032431 | Grad Max: 3.348663
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000218 | Grad Max: 0.007287
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016753 | Grad Max: 0.090358
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000342
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003398 | Grad Max: 0.007793
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000159
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000869 | Grad Max: 0.002294
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001722
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014350 | Grad Max: 0.014350
[GRADIENT NORM TOTAL] 6.8996

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.911
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.77128774 0.22871223] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.076
[MASKS] A(Pass/Fail): 744/1304 | B: 634/1414 | C: 566/1482
[LOSS Ex1] A: 0.63633 | B: 0.62556 | C: 0.61720
[LOGITS Ex2 A] Mean Abs: 2.065 | Max: 5.636
[LOSS Ex2] A: 0.11781 | B: 0.34000 | C: 0.22905
** [JOINT LOSS] ** : 0.855318
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003797 | Grad Max: 0.097489
  -> Layer: shared_layers.0.bias | Grad Mean: 0.113416 | Grad Max: 0.438873
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.005772
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003208 | Grad Max: 0.003208
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000932 | Grad Max: 0.158976
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016561 | Grad Max: 0.893350
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.005851
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006502 | Grad Max: 0.057710
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000188
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001475 | Grad Max: 0.004062
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000405 | Grad Max: 0.001268
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000470 | Grad Max: 0.001405
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007373 | Grad Max: 0.007373
[GRADIENT NORM TOTAL] 2.9405

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 1.010
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005774  0.49942264] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.074
[MASKS] A(Pass/Fail): 717/1331 | B: 637/1411 | C: 560/1488
[LOSS Ex1] A: 0.64314 | B: 0.62119 | C: 0.61682
[LOGITS Ex2 A] Mean Abs: 2.072 | Max: 5.492
[LOSS Ex2] A: 0.10312 | B: 0.31729 | C: 0.21049
** [JOINT LOSS] ** : 0.837344
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.043189
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125226 | Grad Max: 0.539824
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005694
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001507 | Grad Max: 0.001507
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000662 | Grad Max: 0.360826
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011468 | Grad Max: 1.977722
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000039 | Grad Max: 0.002674
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001908 | Grad Max: 0.016267
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000128
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000364 | Grad Max: 0.002430
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000128 | Grad Max: 0.000695
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000637 | Grad Max: 0.001429
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001328 | Grad Max: 0.001328
[GRADIENT NORM TOTAL] 3.8245

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.709
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.710594 0.289406] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 712/1336 | B: 600/1256 | C: 565/1483
[LOSS Ex1] A: 0.63879 | B: 0.62553 | C: 0.61734
[LOGITS Ex2 A] Mean Abs: 2.069 | Max: 5.581
[LOSS Ex2] A: 0.13085 | B: 0.31436 | C: 0.23428
** [JOINT LOSS] ** : 0.853718
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003286 | Grad Max: 0.067784
  -> Layer: shared_layers.0.bias | Grad Mean: 0.194833 | Grad Max: 0.924442
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006027
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004843 | Grad Max: 0.004843
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001275 | Grad Max: 0.422251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023182 | Grad Max: 2.369298
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000114 | Grad Max: 0.004331
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008290 | Grad Max: 0.047160
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000214
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001756 | Grad Max: 0.005001
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000109
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000442 | Grad Max: 0.001389
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001359
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006934 | Grad Max: 0.006934
[GRADIENT NORM TOTAL] 5.2124

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.819
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6273579  0.37264213] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 588/1028 | B: 647/1401 | C: 560/1488
[LOSS Ex1] A: 0.63702 | B: 0.62482 | C: 0.61849
[LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.868
[LOSS Ex2] A: 0.11668 | B: 0.34671 | C: 0.22906
** [JOINT LOSS] ** : 0.857593
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001781 | Grad Max: 0.090071
  -> Layer: shared_layers.0.bias | Grad Mean: 0.129846 | Grad Max: 1.101399
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.006179
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002934 | Grad Max: 0.002934
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000981 | Grad Max: 0.278728
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017599 | Grad Max: 1.551839
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000097 | Grad Max: 0.006464
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007243 | Grad Max: 0.058905
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000217
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001378 | Grad Max: 0.003968
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000340 | Grad Max: 0.001041
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000357 | Grad Max: 0.001039
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004430 | Grad Max: 0.004430
[GRADIENT NORM TOTAL] 3.7241

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.011
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50757766 0.4924223 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 717/1331 | B: 635/1413 | C: 548/1500
[LOSS Ex1] A: 0.63767 | B: 0.62545 | C: 0.61971
[LOGITS Ex2 A] Mean Abs: 2.106 | Max: 8.026
[LOSS Ex2] A: 0.11256 | B: 0.34007 | C: 0.23433
** [JOINT LOSS] ** : 0.856596
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001850 | Grad Max: 0.049770
  -> Layer: shared_layers.0.bias | Grad Mean: 0.074632 | Grad Max: 0.346958
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.006028
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000304 | Grad Max: 0.000304
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000593 | Grad Max: 0.120398
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010483 | Grad Max: 0.662506
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.002621
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003072 | Grad Max: 0.024801
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000581 | Grad Max: 0.002784
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000750
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000434 | Grad Max: 0.001279
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002856 | Grad Max: 0.002856
[GRADIENT NORM TOTAL] 2.0353

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.954
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093446  0.49065536] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 711/1337 | B: 637/1411 | C: 543/1505
[LOSS Ex1] A: 0.63449 | B: 0.62105 | C: 0.61984
[LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.254
[LOSS Ex2] A: 0.12030 | B: 0.31771 | C: 0.24439
** [JOINT LOSS] ** : 0.852592
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002394 | Grad Max: 0.060113
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141248 | Grad Max: 0.764796
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.006086
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001869 | Grad Max: 0.001869
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001000 | Grad Max: 0.209856
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018073 | Grad Max: 1.162398
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004782
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007773 | Grad Max: 0.051027
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000210
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001592 | Grad Max: 0.004615
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000407 | Grad Max: 0.001173
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001411
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007024 | Grad Max: 0.007024
[GRADIENT NORM TOTAL] 3.4739

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.984
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.505875   0.49412507] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 716/1332 | B: 600/1256 | C: 565/1483
[LOSS Ex1] A: 0.63348 | B: 0.62539 | C: 0.61567
[LOGITS Ex2 A] Mean Abs: 2.084 | Max: 9.020
[LOSS Ex2] A: 0.12323 | B: 0.32853 | C: 0.22955
** [JOINT LOSS] ** : 0.851944
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003337 | Grad Max: 0.117587
  -> Layer: shared_layers.0.bias | Grad Mean: 0.231074 | Grad Max: 0.978036
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.006393
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001501 | Grad Max: 0.001501
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001540 | Grad Max: 0.190653
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026723 | Grad Max: 1.055891
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000167 | Grad Max: 0.006855
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012167 | Grad Max: 0.083709
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000232
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002229 | Grad Max: 0.006065
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000585 | Grad Max: 0.001543
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001351
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010379 | Grad Max: 0.010379
[GRADIENT NORM TOTAL] 4.7280

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.813
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016365 0.4983635] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.074
[MASKS] A(Pass/Fail): 684/1364 | B: 648/1400 | C: 524/1524
[LOSS Ex1] A: 0.64203 | B: 0.62466 | C: 0.62450
[LOGITS Ex2 A] Mean Abs: 2.091 | Max: 6.056
[LOSS Ex2] A: 0.11721 | B: 0.34137 | C: 0.27368
** [JOINT LOSS] ** : 0.874485
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003078 | Grad Max: 0.073093
  -> Layer: shared_layers.0.bias | Grad Mean: 0.137802 | Grad Max: 0.788849
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.005751
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005517 | Grad Max: 0.005517
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000819 | Grad Max: 0.437790
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013689 | Grad Max: 2.450477
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.003287
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002402 | Grad Max: 0.021946
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000130
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000551 | Grad Max: 0.002760
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000197 | Grad Max: 0.000947
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000227 | Grad Max: 0.000769
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005556 | Grad Max: 0.005556
[GRADIENT NORM TOTAL] 3.9368

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.731
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438446 0.4561554] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.072
[MASKS] A(Pass/Fail): 683/1365 | B: 635/1413 | C: 541/1507
[LOSS Ex1] A: 0.64228 | B: 0.62527 | C: 0.62191
[LOGITS Ex2 A] Mean Abs: 2.124 | Max: 6.116
[LOSS Ex2] A: 0.11743 | B: 0.33542 | C: 0.25674
** [JOINT LOSS] ** : 0.866353
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002233 | Grad Max: 0.069044
  -> Layer: shared_layers.0.bias | Grad Mean: 0.208314 | Grad Max: 0.971210
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005486
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003518 | Grad Max: 0.003518
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001393 | Grad Max: 0.263251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025499 | Grad Max: 1.475172
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.008065
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011519 | Grad Max: 0.088324
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000254
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002198 | Grad Max: 0.005591
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000117
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000548 | Grad Max: 0.001486
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000344 | Grad Max: 0.001221
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008545 | Grad Max: 0.008545
[GRADIENT NORM TOTAL] 4.8298

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.916
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7723748  0.22762527] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.076
[MASKS] A(Pass/Fail): 744/1304 | B: 638/1410 | C: 567/1481
[LOSS Ex1] A: 0.63611 | B: 0.62087 | C: 0.61930
[LOGITS Ex2 A] Mean Abs: 2.118 | Max: 5.593
[LOSS Ex2] A: 0.11154 | B: 0.32179 | C: 0.23656
** [JOINT LOSS] ** : 0.848724
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.057727
  -> Layer: shared_layers.0.bias | Grad Mean: 0.113522 | Grad Max: 0.809070
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002260 | Grad Max: 0.006340
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003366 | Grad Max: 0.003366
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000772 | Grad Max: 0.329942
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013748 | Grad Max: 1.836416
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002839
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002723 | Grad Max: 0.021703
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000137
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000468 | Grad Max: 0.002519
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000106 | Grad Max: 0.000505
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001024
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000537 | Grad Max: 0.000537
[GRADIENT NORM TOTAL] 3.5591

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 1.016
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50055236 0.49944767] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 717/1331 | B: 600/1256 | C: 547/1501
[LOSS Ex1] A: 0.64292 | B: 0.62521 | C: 0.61716
[LOGITS Ex2 A] Mean Abs: 2.113 | Max: 6.141
[LOSS Ex2] A: 0.10776 | B: 0.33609 | C: 0.20790
** [JOINT LOSS] ** : 0.845678
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005068 | Grad Max: 0.171448
  -> Layer: shared_layers.0.bias | Grad Mean: 0.334059 | Grad Max: 1.567778
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005436
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003976 | Grad Max: 0.003976
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.386953
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040160 | Grad Max: 2.171730
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.010675
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022128 | Grad Max: 0.116522
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000457
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004632 | Grad Max: 0.009965
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000238
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001171 | Grad Max: 0.003253
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000820 | Grad Max: 0.002402
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019110 | Grad Max: 0.019110
[GRADIENT NORM TOTAL] 7.1770

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.713
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71135134 0.2886487 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.076
[MASKS] A(Pass/Fail): 712/1336 | B: 648/1400 | C: 534/1514
[LOSS Ex1] A: 0.63855 | B: 0.62449 | C: 0.62511
[LOGITS Ex2 A] Mean Abs: 2.115 | Max: 6.033
[LOSS Ex2] A: 0.13757 | B: 0.33948 | C: 0.26045
** [JOINT LOSS] ** : 0.875217
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003955 | Grad Max: 0.127805
  -> Layer: shared_layers.0.bias | Grad Mean: 0.180974 | Grad Max: 0.958177
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.006094
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001540 | Grad Max: 0.001540
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001257 | Grad Max: 0.165826
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022180 | Grad Max: 0.801565
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.004915
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010910 | Grad Max: 0.058415
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000228
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002385 | Grad Max: 0.005747
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000144
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000651 | Grad Max: 0.001773
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000533 | Grad Max: 0.001552
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012541 | Grad Max: 0.012541
[GRADIENT NORM TOTAL] 3.8253

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.824
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6277436  0.37225637] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.076
[MASKS] A(Pass/Fail): 588/1028 | B: 635/1413 | C: 376/1000
[LOSS Ex1] A: 0.63677 | B: 0.62509 | C: 0.62161
[LOGITS Ex2 A] Mean Abs: 2.214 | Max: 8.392
[LOSS Ex2] A: 0.12123 | B: 0.35392 | C: 0.26701
** [JOINT LOSS] ** : 0.875211
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006634 | Grad Max: 0.271393
  -> Layer: shared_layers.0.bias | Grad Mean: 0.660260 | Grad Max: 3.548107
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006130
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008947 | Grad Max: 0.008947
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004335 | Grad Max: 0.786195
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080710 | Grad Max: 4.377571
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000518 | Grad Max: 0.019587
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040424 | Grad Max: 0.219390
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000668
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008217 | Grad Max: 0.016641
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000396
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002079 | Grad Max: 0.005286
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001347 | Grad Max: 0.002620
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032649 | Grad Max: 0.032649
[GRADIENT NORM TOTAL] 15.1453

[EPOCH SUMMARY] Train Loss: 0.8584

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8535 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 136/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.018
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076026  0.49239737] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 717/1331 | B: 638/1410 | C: 556/1492
[LOSS Ex1] A: 0.63742 | B: 0.62069 | C: 0.61906
[LOGITS Ex2 A] Mean Abs: 2.177 | Max: 7.438
[LOSS Ex2] A: 0.12242 | B: 0.33823 | C: 0.25633
** [JOINT LOSS] ** : 0.864719
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009357 | Grad Max: 0.296524
  -> Layer: shared_layers.0.bias | Grad Mean: 0.809483 | Grad Max: 3.858690
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.005936
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001558 | Grad Max: 0.001558
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005334 | Grad Max: 0.783591
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.099053 | Grad Max: 4.357076
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000641 | Grad Max: 0.021983
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049701 | Grad Max: 0.253886
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000815
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010213 | Grad Max: 0.020458
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000416
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002624 | Grad Max: 0.006570
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001757 | Grad Max: 0.003515
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043310 | Grad Max: 0.043310
[GRADIENT NORM TOTAL] 17.8387

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.960
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50926834 0.49073163] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 711/1337 | B: 600/1256 | C: 579/1469
[LOSS Ex1] A: 0.63424 | B: 0.62503 | C: 0.61744
[LOGITS Ex2 A] Mean Abs: 2.168 | Max: 6.265
[LOSS Ex2] A: 0.11726 | B: 0.32062 | C: 0.21806
** [JOINT LOSS] ** : 0.844217
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007325 | Grad Max: 0.155037
  -> Layer: shared_layers.0.bias | Grad Mean: 0.423434 | Grad Max: 1.804322
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.006877
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006248 | Grad Max: 0.006248
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003025 | Grad Max: 0.351640
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055948 | Grad Max: 1.935275
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000381 | Grad Max: 0.013418
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028950 | Grad Max: 0.160872
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000540
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006047 | Grad Max: 0.012770
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000315
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001608 | Grad Max: 0.004165
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001117 | Grad Max: 0.002997
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027841 | Grad Max: 0.027841
[GRADIENT NORM TOTAL] 9.1708

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.990
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50599855 0.49400142] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 716/1332 | B: 648/1400 | C: 559/1489
[LOSS Ex1] A: 0.63323 | B: 0.62432 | C: 0.62116
[LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.916
[LOSS Ex2] A: 0.13522 | B: 0.35311 | C: 0.23427
** [JOINT LOSS] ** : 0.867102
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004468 | Grad Max: 0.188422
  -> Layer: shared_layers.0.bias | Grad Mean: 0.476903 | Grad Max: 2.556624
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.006088
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001324 | Grad Max: 0.001324
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002921 | Grad Max: 0.554850
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053507 | Grad Max: 3.075500
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.015736
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024091 | Grad Max: 0.167933
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000413
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004754 | Grad Max: 0.009862
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000192
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001232 | Grad Max: 0.002991
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000839 | Grad Max: 0.002043
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021085 | Grad Max: 0.021085
[GRADIENT NORM TOTAL] 10.8021

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.818
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50163054 0.4983695 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.075
[MASKS] A(Pass/Fail): 684/1364 | B: 635/1413 | C: 564/1484
[LOSS Ex1] A: 0.64181 | B: 0.62493 | C: 0.61817
[LOGITS Ex2 A] Mean Abs: 2.044 | Max: 5.712
[LOSS Ex2] A: 0.11853 | B: 0.36237 | C: 0.25743
** [JOINT LOSS] ** : 0.874411
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008773 | Grad Max: 0.280249
  -> Layer: shared_layers.0.bias | Grad Mean: 0.827290 | Grad Max: 3.561022
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006331
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010744 | Grad Max: 0.010744
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005230 | Grad Max: 0.774873
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.098552 | Grad Max: 4.336165
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000667 | Grad Max: 0.022421
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.051966 | Grad Max: 0.266315
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000781
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010513 | Grad Max: 0.020186
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000448
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002693 | Grad Max: 0.006615
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001859 | Grad Max: 0.003542
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045155 | Grad Max: 0.045155
[GRADIENT NORM TOTAL] 17.7050

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.735
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54389197 0.45610803] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.072
[MASKS] A(Pass/Fail): 683/1365 | B: 638/1410 | C: 550/1498
[LOSS Ex1] A: 0.64207 | B: 0.62054 | C: 0.61831
[LOGITS Ex2 A] Mean Abs: 2.028 | Max: 5.702
[LOSS Ex2] A: 0.12411 | B: 0.33631 | C: 0.22939
** [JOINT LOSS] ** : 0.856909
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006574 | Grad Max: 0.182200
  -> Layer: shared_layers.0.bias | Grad Mean: 0.575649 | Grad Max: 2.523529
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.005826
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006064 | Grad Max: 0.006064
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003574 | Grad Max: 0.595188
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067371 | Grad Max: 3.363993
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000450 | Grad Max: 0.015499
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034832 | Grad Max: 0.179254
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000552
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006988 | Grad Max: 0.014689
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000318
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001780 | Grad Max: 0.004541
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001197 | Grad Max: 0.002835
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029073 | Grad Max: 0.029073
[GRADIENT NORM TOTAL] 12.3312

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.921
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.77354807 0.22645193] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.076
[MASKS] A(Pass/Fail): 744/1304 | B: 600/1256 | C: 543/1505
[LOSS Ex1] A: 0.63590 | B: 0.62488 | C: 0.62126
[LOGITS Ex2 A] Mean Abs: 2.105 | Max: 6.107
[LOSS Ex2] A: 0.11096 | B: 0.32448 | C: 0.25985
** [JOINT LOSS] ** : 0.859113
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004320 | Grad Max: 0.153862
  -> Layer: shared_layers.0.bias | Grad Mean: 0.115095 | Grad Max: 0.524855
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005822
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000342 | Grad Max: 0.000342
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001061 | Grad Max: 0.189323
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018051 | Grad Max: 1.062341
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.005326
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007930 | Grad Max: 0.058952
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000231
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001849 | Grad Max: 0.004988
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000118
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000489 | Grad Max: 0.001490
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000375 | Grad Max: 0.001241
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007945 | Grad Max: 0.007945
[GRADIENT NORM TOTAL] 2.9779

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 1.021
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005431 0.4994569] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 718/1330 | B: 648/1400 | C: 596/1452
[LOSS Ex1] A: 0.64272 | B: 0.62418 | C: 0.61780
[LOGITS Ex2 A] Mean Abs: 2.139 | Max: 6.353
[LOSS Ex2] A: 0.10624 | B: 0.33564 | C: 0.23605
** [JOINT LOSS] ** : 0.854212
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002380 | Grad Max: 0.092961
  -> Layer: shared_layers.0.bias | Grad Mean: 0.229213 | Grad Max: 0.940369
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005833
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001528 | Grad Max: 0.001528
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001545 | Grad Max: 0.243792
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028452 | Grad Max: 1.355964
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.007057
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014851 | Grad Max: 0.077911
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000301
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002994 | Grad Max: 0.006866
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000789 | Grad Max: 0.001826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000501 | Grad Max: 0.001743
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012893 | Grad Max: 0.012893
[GRADIENT NORM TOTAL] 5.1248

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.717
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.712185   0.28781497] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.076
[MASKS] A(Pass/Fail): 713/1335 | B: 635/1413 | C: 565/1483
[LOSS Ex1] A: 0.63835 | B: 0.62479 | C: 0.61290
[LOGITS Ex2 A] Mean Abs: 2.105 | Max: 5.965
[LOSS Ex2] A: 0.13174 | B: 0.33848 | C: 0.22916
** [JOINT LOSS] ** : 0.858472
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004069 | Grad Max: 0.138012
  -> Layer: shared_layers.0.bias | Grad Mean: 0.121883 | Grad Max: 0.415941
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.006117
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002626 | Grad Max: 0.002626
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000956 | Grad Max: 0.351677
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016544 | Grad Max: 1.980028
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003321
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005864 | Grad Max: 0.033558
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000243
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001342 | Grad Max: 0.004796
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000325 | Grad Max: 0.001141
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001228
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004873 | Grad Max: 0.004873
[GRADIENT NORM TOTAL] 3.6294

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.828
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62821823 0.37178177] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.076
[MASKS] A(Pass/Fail): 588/1028 | B: 638/1410 | C: 580/1468
[LOSS Ex1] A: 0.63657 | B: 0.62040 | C: 0.61793
[LOGITS Ex2 A] Mean Abs: 2.131 | Max: 7.070
[LOSS Ex2] A: 0.11672 | B: 0.31727 | C: 0.24565
** [JOINT LOSS] ** : 0.851512
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003523 | Grad Max: 0.137655
  -> Layer: shared_layers.0.bias | Grad Mean: 0.077980 | Grad Max: 0.322393
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.006577
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010357 | Grad Max: 0.010357
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000911 | Grad Max: 0.111592
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015339 | Grad Max: 0.623305
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.005139
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007588 | Grad Max: 0.051329
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001785 | Grad Max: 0.004844
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000122
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000469 | Grad Max: 0.001226
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001456
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008098 | Grad Max: 0.008098
[GRADIENT NORM TOTAL] 2.3260

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.022
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50762016 0.49237987] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 717/1331 | B: 600/1256 | C: 547/1501
[LOSS Ex1] A: 0.63721 | B: 0.62473 | C: 0.61767
[LOGITS Ex2 A] Mean Abs: 2.160 | Max: 8.386
[LOSS Ex2] A: 0.10701 | B: 0.31814 | C: 0.26023
** [JOINT LOSS] ** : 0.855000
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004745 | Grad Max: 0.159380
  -> Layer: shared_layers.0.bias | Grad Mean: 0.382879 | Grad Max: 1.849231
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.005875
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001134 | Grad Max: 0.001134
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002574 | Grad Max: 0.415286
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047059 | Grad Max: 2.311259
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000301 | Grad Max: 0.010322
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023582 | Grad Max: 0.120438
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000367
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004915 | Grad Max: 0.010002
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000231
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001284 | Grad Max: 0.003205
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000788 | Grad Max: 0.002297
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021029 | Grad Max: 0.021029
[GRADIENT NORM TOTAL] 8.6286

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.964
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.509228   0.49077204] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 712/1336 | B: 648/1400 | C: 542/1506
[LOSS Ex1] A: 0.63402 | B: 0.62403 | C: 0.62211
[LOGITS Ex2 A] Mean Abs: 2.166 | Max: 6.146
[LOSS Ex2] A: 0.11928 | B: 0.33326 | C: 0.24339
** [JOINT LOSS] ** : 0.858697
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005569 | Grad Max: 0.199196
  -> Layer: shared_layers.0.bias | Grad Mean: 0.316453 | Grad Max: 1.486901
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002291 | Grad Max: 0.006211
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008093 | Grad Max: 0.008093
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002446 | Grad Max: 0.338221
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044152 | Grad Max: 1.902192
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.009507
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023736 | Grad Max: 0.111132
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000420
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005008 | Grad Max: 0.010381
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000259
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001283 | Grad Max: 0.003135
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000843 | Grad Max: 0.002079
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020909 | Grad Max: 0.020909
[GRADIENT NORM TOTAL] 7.2901

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.995
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50606596 0.49393407] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 717/1331 | B: 635/1413 | C: 556/1492
[LOSS Ex1] A: 0.63300 | B: 0.62463 | C: 0.61865
[LOGITS Ex2 A] Mean Abs: 2.096 | Max: 7.971
[LOSS Ex2] A: 0.13750 | B: 0.34829 | C: 0.24926
** [JOINT LOSS] ** : 0.870442
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004846 | Grad Max: 0.188462
  -> Layer: shared_layers.0.bias | Grad Mean: 0.324436 | Grad Max: 1.778595
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.006520
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001609 | Grad Max: 0.001609
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.523810
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035363 | Grad Max: 2.870143
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.009129
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015643 | Grad Max: 0.106349
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000259
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002745 | Grad Max: 0.006134
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000145
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000716 | Grad Max: 0.001869
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000481 | Grad Max: 0.001299
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012889 | Grad Max: 0.012889
[GRADIENT NORM TOTAL] 7.1032

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.822
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016118 0.4983882] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.075
[MASKS] A(Pass/Fail): 685/1363 | B: 638/1410 | C: 548/1500
[LOSS Ex1] A: 0.64159 | B: 0.62024 | C: 0.62264
[LOGITS Ex2 A] Mean Abs: 2.073 | Max: 6.219
[LOSS Ex2] A: 0.10899 | B: 0.32874 | C: 0.23830
** [JOINT LOSS] ** : 0.853502
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003507 | Grad Max: 0.121650
  -> Layer: shared_layers.0.bias | Grad Mean: 0.369036 | Grad Max: 1.650256
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005408
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003240 | Grad Max: 0.003240
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.275986
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041024 | Grad Max: 1.556629
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.012191
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021248 | Grad Max: 0.137525
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000375
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004058 | Grad Max: 0.009494
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001021 | Grad Max: 0.002787
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000668 | Grad Max: 0.001964
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016445 | Grad Max: 0.016445
[GRADIENT NORM TOTAL] 7.4089

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.740
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54393584 0.45606413] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.072
[MASKS] A(Pass/Fail): 683/1365 | B: 600/1256 | C: 368/1008
[LOSS Ex1] A: 0.64187 | B: 0.62457 | C: 0.62154
[LOGITS Ex2 A] Mean Abs: 2.091 | Max: 6.176
[LOSS Ex2] A: 0.11597 | B: 0.31625 | C: 0.23541
** [JOINT LOSS] ** : 0.851871
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.071276
  -> Layer: shared_layers.0.bias | Grad Mean: 0.107327 | Grad Max: 0.495634
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.006083
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002677 | Grad Max: 0.002677
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000816 | Grad Max: 0.208021
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014138 | Grad Max: 1.137867
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003971
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002475 | Grad Max: 0.024599
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000121
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000381 | Grad Max: 0.002297
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000064
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000094 | Grad Max: 0.000610
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001138
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000119 | Grad Max: 0.000119
[GRADIENT NORM TOTAL] 3.0116

[EPOCH SUMMARY] Train Loss: 0.8586

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8412 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8421 -> New: 0.8412)

############################## EPOCH 137/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.925
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.77480894 0.22519103] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.076
[MASKS] A(Pass/Fail): 745/1303 | B: 649/1399 | C: 533/1515
[LOSS Ex1] A: 0.63567 | B: 0.62387 | C: 0.62554
[LOGITS Ex2 A] Mean Abs: 2.149 | Max: 6.392
[LOSS Ex2] A: 0.11865 | B: 0.34101 | C: 0.24559
** [JOINT LOSS] ** : 0.863443
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007277 | Grad Max: 0.186925
  -> Layer: shared_layers.0.bias | Grad Mean: 0.472116 | Grad Max: 2.158170
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005586
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002355 | Grad Max: 0.002355
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003101 | Grad Max: 0.483135
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057197 | Grad Max: 2.692997
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.015054
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030643 | Grad Max: 0.175668
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000570
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006295 | Grad Max: 0.014117
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000324
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001589 | Grad Max: 0.004309
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000999 | Grad Max: 0.002308
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024726 | Grad Max: 0.024726
[GRADIENT NORM TOTAL] 10.3127

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 1.026
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005094  0.49949065] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 718/1330 | B: 636/1412 | C: 546/1502
[LOSS Ex1] A: 0.64251 | B: 0.62447 | C: 0.61910
[LOGITS Ex2 A] Mean Abs: 2.177 | Max: 5.784
[LOSS Ex2] A: 0.10518 | B: 0.34171 | C: 0.24577
** [JOINT LOSS] ** : 0.859579
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003414 | Grad Max: 0.107920
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289460 | Grad Max: 1.505160
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005647
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002844 | Grad Max: 0.002844
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.389044
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038648 | Grad Max: 2.193226
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.011278
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018139 | Grad Max: 0.111627
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000303
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003482 | Grad Max: 0.008363
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000158
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000859 | Grad Max: 0.002354
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001627
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012375 | Grad Max: 0.012375
[GRADIENT NORM TOTAL] 7.1690

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.721
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7130187 0.2869813] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.076
[MASKS] A(Pass/Fail): 714/1334 | B: 638/1410 | C: 553/1495
[LOSS Ex1] A: 0.63812 | B: 0.62008 | C: 0.61820
[LOGITS Ex2 A] Mean Abs: 2.110 | Max: 6.003
[LOSS Ex2] A: 0.13333 | B: 0.32058 | C: 0.21918
** [JOINT LOSS] ** : 0.849826
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005918 | Grad Max: 0.181300
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289124 | Grad Max: 1.674311
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.005769
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003614 | Grad Max: 0.003614
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.389537
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034513 | Grad Max: 2.194812
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000236 | Grad Max: 0.007972
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018073 | Grad Max: 0.091532
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000365
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003824 | Grad Max: 0.008015
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000954 | Grad Max: 0.002736
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000665 | Grad Max: 0.002102
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014809 | Grad Max: 0.014809
[GRADIENT NORM TOTAL] 6.2368

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.832
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6286566 0.3713434] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.076
[MASKS] A(Pass/Fail): 589/1027 | B: 600/1256 | C: 580/1468
[LOSS Ex1] A: 0.63633 | B: 0.62441 | C: 0.61361
[LOGITS Ex2 A] Mean Abs: 2.173 | Max: 7.511
[LOSS Ex2] A: 0.11632 | B: 0.32081 | C: 0.23798
** [JOINT LOSS] ** : 0.849824
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006669 | Grad Max: 0.206724
  -> Layer: shared_layers.0.bias | Grad Mean: 0.429852 | Grad Max: 2.039979
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.006222
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005552 | Grad Max: 0.005552
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002661 | Grad Max: 0.601511
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048290 | Grad Max: 3.350161
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000315 | Grad Max: 0.012347
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024467 | Grad Max: 0.134525
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000448
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005166 | Grad Max: 0.010644
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000241
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001306 | Grad Max: 0.003127
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000893 | Grad Max: 0.002086
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021478 | Grad Max: 0.021478
[GRADIENT NORM TOTAL] 9.5909

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.028
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076454 0.4923546] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 717/1331 | B: 649/1399 | C: 610/1438
[LOSS Ex1] A: 0.63700 | B: 0.62372 | C: 0.61312
[LOGITS Ex2 A] Mean Abs: 2.175 | Max: 8.167
[LOSS Ex2] A: 0.10954 | B: 0.33538 | C: 0.24061
** [JOINT LOSS] ** : 0.853118
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003081 | Grad Max: 0.126180
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149661 | Grad Max: 0.741012
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006771
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005272 | Grad Max: 0.005272
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001273 | Grad Max: 0.363780
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022433 | Grad Max: 2.047498
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.004430
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007573 | Grad Max: 0.054724
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000148
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001285 | Grad Max: 0.004133
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000319 | Grad Max: 0.001185
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000336 | Grad Max: 0.001045
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004561 | Grad Max: 0.004561
[GRADIENT NORM TOTAL] 4.2451

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.970
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50918984 0.49081016] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 712/1336 | B: 636/1412 | C: 582/1466
[LOSS Ex1] A: 0.63380 | B: 0.62432 | C: 0.62042
[LOGITS Ex2 A] Mean Abs: 2.193 | Max: 5.908
[LOSS Ex2] A: 0.11244 | B: 0.33998 | C: 0.23494
** [JOINT LOSS] ** : 0.855297
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003572 | Grad Max: 0.090099
  -> Layer: shared_layers.0.bias | Grad Mean: 0.274720 | Grad Max: 1.140549
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006753
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001644 | Grad Max: 0.001644
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001888 | Grad Max: 0.312162
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034976 | Grad Max: 1.741137
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.007604
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017224 | Grad Max: 0.091453
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000304
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003428 | Grad Max: 0.008214
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000836 | Grad Max: 0.002320
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000491 | Grad Max: 0.001412
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012362 | Grad Max: 0.012362
[GRADIENT NORM TOTAL] 6.2191

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.000
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061461  0.49385396] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 717/1331 | B: 638/1410 | C: 560/1488
[LOSS Ex1] A: 0.63279 | B: 0.61992 | C: 0.61685
[LOGITS Ex2 A] Mean Abs: 2.115 | Max: 6.502
[LOSS Ex2] A: 0.12662 | B: 0.32286 | C: 0.23367
** [JOINT LOSS] ** : 0.850900
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003398 | Grad Max: 0.076180
  -> Layer: shared_layers.0.bias | Grad Mean: 0.195642 | Grad Max: 0.991908
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002333 | Grad Max: 0.006174
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004663 | Grad Max: 0.004663
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001290 | Grad Max: 0.587768
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022611 | Grad Max: 3.269629
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.005201
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007312 | Grad Max: 0.060499
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000194
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001386 | Grad Max: 0.004289
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001197
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.001450
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007640 | Grad Max: 0.007640
[GRADIENT NORM TOTAL] 5.5504

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.826
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50159836 0.49840164] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.075
[MASKS] A(Pass/Fail): 685/1363 | B: 600/1256 | C: 556/1492
[LOSS Ex1] A: 0.64139 | B: 0.62425 | C: 0.61526
[LOGITS Ex2 A] Mean Abs: 2.102 | Max: 5.867
[LOSS Ex2] A: 0.11309 | B: 0.33024 | C: 0.21693
** [JOINT LOSS] ** : 0.847055
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003083 | Grad Max: 0.106989
  -> Layer: shared_layers.0.bias | Grad Mean: 0.324086 | Grad Max: 1.342922
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005793
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006150 | Grad Max: 0.006150
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.327135
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042066 | Grad Max: 1.839834
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.012671
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022063 | Grad Max: 0.145535
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000342
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004364 | Grad Max: 0.009722
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001095 | Grad Max: 0.003269
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000705 | Grad Max: 0.002122
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017673 | Grad Max: 0.017673
[GRADIENT NORM TOTAL] 7.3099

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.743
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439039  0.45609614] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.073
[MASKS] A(Pass/Fail): 683/1365 | B: 649/1399 | C: 546/1502
[LOSS Ex1] A: 0.64168 | B: 0.62356 | C: 0.62269
[LOGITS Ex2 A] Mean Abs: 2.106 | Max: 5.640
[LOSS Ex2] A: 0.11966 | B: 0.33958 | C: 0.27049
** [JOINT LOSS] ** : 0.872554
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004560 | Grad Max: 0.131719
  -> Layer: shared_layers.0.bias | Grad Mean: 0.169226 | Grad Max: 0.784340
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005773
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009210 | Grad Max: 0.009210
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001113 | Grad Max: 0.276424
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019170 | Grad Max: 1.544752
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.003783
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007496 | Grad Max: 0.043029
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000236
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001626 | Grad Max: 0.005071
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000375 | Grad Max: 0.001243
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000221 | Grad Max: 0.000863
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004137 | Grad Max: 0.004137
[GRADIENT NORM TOTAL] 4.0438

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.929
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7758441 0.2241559] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.077
[MASKS] A(Pass/Fail): 745/1303 | B: 636/1412 | C: 551/1497
[LOSS Ex1] A: 0.63546 | B: 0.62416 | C: 0.62086
[LOGITS Ex2 A] Mean Abs: 2.166 | Max: 6.460
[LOSS Ex2] A: 0.12039 | B: 0.33985 | C: 0.21902
** [JOINT LOSS] ** : 0.853247
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004766 | Grad Max: 0.159686
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282445 | Grad Max: 1.420996
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.006288
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006652 | Grad Max: 0.006652
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.406144
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033760 | Grad Max: 2.272421
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000213 | Grad Max: 0.006708
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016445 | Grad Max: 0.078791
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000325
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003531 | Grad Max: 0.008226
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000894 | Grad Max: 0.002310
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000559 | Grad Max: 0.002040
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014141 | Grad Max: 0.014141
[GRADIENT NORM TOTAL] 6.5322

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 1.031
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004766  0.49952343] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.075
[MASKS] A(Pass/Fail): 718/1330 | B: 639/1409 | C: 552/1496
[LOSS Ex1] A: 0.64231 | B: 0.61977 | C: 0.62185
[LOGITS Ex2 A] Mean Abs: 2.149 | Max: 6.117
[LOSS Ex2] A: 0.10373 | B: 0.31953 | C: 0.25016
** [JOINT LOSS] ** : 0.852449
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003519 | Grad Max: 0.090564
  -> Layer: shared_layers.0.bias | Grad Mean: 0.205184 | Grad Max: 0.990398
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005414
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000734 | Grad Max: 0.000734
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001371 | Grad Max: 0.384083
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024709 | Grad Max: 2.164959
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000155 | Grad Max: 0.006727
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011906 | Grad Max: 0.076674
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000273
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002429 | Grad Max: 0.005865
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000124
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000605 | Grad Max: 0.001722
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000476 | Grad Max: 0.001614
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010573 | Grad Max: 0.010573
[GRADIENT NORM TOTAL] 4.8816

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.725
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71366674 0.2863333 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.077
[MASKS] A(Pass/Fail): 714/1334 | B: 600/1256 | C: 571/1477
[LOSS Ex1] A: 0.63791 | B: 0.62410 | C: 0.61275
[LOGITS Ex2 A] Mean Abs: 2.126 | Max: 6.887
[LOSS Ex2] A: 0.12326 | B: 0.31852 | C: 0.22229
** [JOINT LOSS] ** : 0.846273
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003654 | Grad Max: 0.106350
  -> Layer: shared_layers.0.bias | Grad Mean: 0.172157 | Grad Max: 0.875789
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.006185
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000322 | Grad Max: 0.000322
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001150 | Grad Max: 0.149197
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020491 | Grad Max: 0.831694
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000120 | Grad Max: 0.004877
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009228 | Grad Max: 0.055346
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000230
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001900 | Grad Max: 0.005353
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000462 | Grad Max: 0.001564
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000387 | Grad Max: 0.001532
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007420 | Grad Max: 0.007420
[GRADIENT NORM TOTAL] 3.5181

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.837
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62893605 0.371064  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 589/1027 | B: 649/1399 | C: 550/1498
[LOSS Ex1] A: 0.63611 | B: 0.62341 | C: 0.61919
[LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.337
[LOSS Ex2] A: 0.11522 | B: 0.33537 | C: 0.22686
** [JOINT LOSS] ** : 0.852055
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003157 | Grad Max: 0.138840
  -> Layer: shared_layers.0.bias | Grad Mean: 0.368306 | Grad Max: 1.769218
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.005873
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002076 | Grad Max: 0.002076
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002426 | Grad Max: 0.338023
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044898 | Grad Max: 1.899609
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.011757
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024577 | Grad Max: 0.140828
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000449
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004888 | Grad Max: 0.010275
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000200
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001231 | Grad Max: 0.003080
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000717 | Grad Max: 0.002023
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019295 | Grad Max: 0.019295
[GRADIENT NORM TOTAL] 8.0605

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.032
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076932 0.4923068] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 717/1331 | B: 636/1412 | C: 377/999
[LOSS Ex1] A: 0.63678 | B: 0.62401 | C: 0.61932
[LOGITS Ex2 A] Mean Abs: 2.181 | Max: 8.456
[LOSS Ex2] A: 0.11654 | B: 0.34006 | C: 0.26308
** [JOINT LOSS] ** : 0.866596
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003732 | Grad Max: 0.113706
  -> Layer: shared_layers.0.bias | Grad Mean: 0.364956 | Grad Max: 1.536005
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005452
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005239 | Grad Max: 0.005239
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002519 | Grad Max: 0.399716
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045968 | Grad Max: 2.228642
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.011636
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022927 | Grad Max: 0.125768
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000401
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004504 | Grad Max: 0.010382
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001097 | Grad Max: 0.002856
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000611 | Grad Max: 0.001648
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016398 | Grad Max: 0.016398
[GRADIENT NORM TOTAL] 8.4874

[EPOCH SUMMARY] Train Loss: 0.8552

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8365 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8412 -> New: 0.8365)

############################## EPOCH 138/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.974
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50907546 0.49092454] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 712/1336 | B: 639/1409 | C: 580/1468
[LOSS Ex1] A: 0.63358 | B: 0.61961 | C: 0.61211
[LOGITS Ex2 A] Mean Abs: 2.156 | Max: 6.085
[LOSS Ex2] A: 0.11357 | B: 0.31548 | C: 0.20640
** [JOINT LOSS] ** : 0.833586
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002333 | Grad Max: 0.065052
  -> Layer: shared_layers.0.bias | Grad Mean: 0.169276 | Grad Max: 0.929979
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002401 | Grad Max: 0.006403
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006325 | Grad Max: 0.006325
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001164 | Grad Max: 0.451354
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020870 | Grad Max: 2.514894
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004267
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008107 | Grad Max: 0.047648
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001734 | Grad Max: 0.004782
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000122
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000422 | Grad Max: 0.001635
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000442 | Grad Max: 0.001593
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006553 | Grad Max: 0.006553
[GRADIENT NORM TOTAL] 4.7865

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.004
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062797  0.49372026] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 717/1331 | B: 600/1256 | C: 575/1473
[LOSS Ex1] A: 0.63257 | B: 0.62394 | C: 0.61558
[LOGITS Ex2 A] Mean Abs: 2.109 | Max: 6.381
[LOSS Ex2] A: 0.13301 | B: 0.32750 | C: 0.24628
** [JOINT LOSS] ** : 0.859628
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003310 | Grad Max: 0.125029
  -> Layer: shared_layers.0.bias | Grad Mean: 0.319880 | Grad Max: 1.637002
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006619
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002541 | Grad Max: 0.002541
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.436214
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038368 | Grad Max: 2.435384
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.009399
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019267 | Grad Max: 0.107404
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000380
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003819 | Grad Max: 0.008842
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000171
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000992 | Grad Max: 0.002441
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000689 | Grad Max: 0.001666
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017753 | Grad Max: 0.017753
[GRADIENT NORM TOTAL] 7.4321

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.830
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015971  0.49840286] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.075
[MASKS] A(Pass/Fail): 685/1363 | B: 649/1399 | C: 549/1499
[LOSS Ex1] A: 0.64118 | B: 0.62326 | C: 0.61948
[LOGITS Ex2 A] Mean Abs: 2.103 | Max: 5.598
[LOSS Ex2] A: 0.11039 | B: 0.33704 | C: 0.22378
** [JOINT LOSS] ** : 0.851712
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.060862
  -> Layer: shared_layers.0.bias | Grad Mean: 0.104461 | Grad Max: 0.540215
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005497
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000208 | Grad Max: 0.000208
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000772 | Grad Max: 0.302267
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013647 | Grad Max: 1.692483
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002608
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002614 | Grad Max: 0.021020
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000155
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000447 | Grad Max: 0.002883
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000106 | Grad Max: 0.000617
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001175
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001508 | Grad Max: 0.001508
[GRADIENT NORM TOTAL] 3.3307

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.747
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438777  0.45612225] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.073
[MASKS] A(Pass/Fail): 684/1364 | B: 636/1412 | C: 585/1463
[LOSS Ex1] A: 0.64148 | B: 0.62386 | C: 0.61851
[LOGITS Ex2 A] Mean Abs: 2.121 | Max: 5.697
[LOSS Ex2] A: 0.12183 | B: 0.33495 | C: 0.24669
** [JOINT LOSS] ** : 0.862439
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004202 | Grad Max: 0.177966
  -> Layer: shared_layers.0.bias | Grad Mean: 0.450845 | Grad Max: 2.282545
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.005954
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009722 | Grad Max: 0.009722
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002809 | Grad Max: 0.460598
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051993 | Grad Max: 2.577108
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.010950
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024609 | Grad Max: 0.137967
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000422
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004934 | Grad Max: 0.010851
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000218
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001244 | Grad Max: 0.003183
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000776 | Grad Max: 0.001945
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020295 | Grad Max: 0.020295
[GRADIENT NORM TOTAL] 9.9252

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.934
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7768577  0.22314233] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.077
[MASKS] A(Pass/Fail): 745/1303 | B: 639/1409 | C: 584/1464
[LOSS Ex1] A: 0.63526 | B: 0.61946 | C: 0.61706
[LOGITS Ex2 A] Mean Abs: 2.155 | Max: 5.909
[LOSS Ex2] A: 0.11868 | B: 0.32065 | C: 0.24907
** [JOINT LOSS] ** : 0.853395
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004580 | Grad Max: 0.136789
  -> Layer: shared_layers.0.bias | Grad Mean: 0.322345 | Grad Max: 1.827062
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.005691
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003593 | Grad Max: 0.003593
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.370891
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036335 | Grad Max: 2.079409
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000207 | Grad Max: 0.008967
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016119 | Grad Max: 0.107147
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000331
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003274 | Grad Max: 0.007984
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000174
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000782 | Grad Max: 0.002302
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001351
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010754 | Grad Max: 0.010754
[GRADIENT NORM TOTAL] 7.2014

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 1.035
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004848  0.49951515] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.076
[MASKS] A(Pass/Fail): 718/1330 | B: 600/1256 | C: 596/1452
[LOSS Ex1] A: 0.64212 | B: 0.62379 | C: 0.61787
[LOGITS Ex2 A] Mean Abs: 2.144 | Max: 5.939
[LOSS Ex2] A: 0.11107 | B: 0.32434 | C: 0.24089
** [JOINT LOSS] ** : 0.853363
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004986 | Grad Max: 0.135589
  -> Layer: shared_layers.0.bias | Grad Mean: 0.293427 | Grad Max: 1.244066
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005614
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001264 | Grad Max: 0.001264
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.221248
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033826 | Grad Max: 1.238881
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.008076
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018427 | Grad Max: 0.095651
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000330
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003890 | Grad Max: 0.008407
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000201
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000988 | Grad Max: 0.002637
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000684 | Grad Max: 0.001895
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016631 | Grad Max: 0.016631
[GRADIENT NORM TOTAL] 5.8468

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.728
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71434695 0.28565305] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.077
[MASKS] A(Pass/Fail): 714/1334 | B: 650/1398 | C: 522/1526
[LOSS Ex1] A: 0.63771 | B: 0.62312 | C: 0.62267
[LOGITS Ex2 A] Mean Abs: 2.132 | Max: 6.479
[LOSS Ex2] A: 0.12829 | B: 0.34528 | C: 0.23903
** [JOINT LOSS] ** : 0.865365
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003760 | Grad Max: 0.089681
  -> Layer: shared_layers.0.bias | Grad Mean: 0.252089 | Grad Max: 1.163138
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005945
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005151 | Grad Max: 0.005151
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001623 | Grad Max: 0.204726
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029346 | Grad Max: 1.141874
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000178 | Grad Max: 0.007717
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013728 | Grad Max: 0.073369
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000268
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002815 | Grad Max: 0.006843
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000141
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000709 | Grad Max: 0.001973
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.001604
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011809 | Grad Max: 0.011809
[GRADIENT NORM TOTAL] 5.4482

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.841
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6292169 0.3707831] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 589/1027 | B: 636/1412 | C: 547/1501
[LOSS Ex1] A: 0.63592 | B: 0.62371 | C: 0.61941
[LOGITS Ex2 A] Mean Abs: 2.222 | Max: 7.044
[LOSS Ex2] A: 0.11281 | B: 0.33903 | C: 0.24247
** [JOINT LOSS] ** : 0.857786
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004432 | Grad Max: 0.157784
  -> Layer: shared_layers.0.bias | Grad Mean: 0.377136 | Grad Max: 1.919337
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.006127
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004616 | Grad Max: 0.004616
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002523 | Grad Max: 0.435848
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046947 | Grad Max: 2.430699
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000296 | Grad Max: 0.011801
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023253 | Grad Max: 0.146079
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000385
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004587 | Grad Max: 0.009907
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000224
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001144 | Grad Max: 0.003213
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000647 | Grad Max: 0.001903
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017459 | Grad Max: 0.017459
[GRADIENT NORM TOTAL] 8.5438

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.037
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50763553 0.49236444] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 718/1330 | B: 639/1409 | C: 539/1509
[LOSS Ex1] A: 0.63658 | B: 0.61932 | C: 0.61736
[LOGITS Ex2 A] Mean Abs: 2.199 | Max: 8.061
[LOSS Ex2] A: 0.11638 | B: 0.31211 | C: 0.22783
** [JOINT LOSS] ** : 0.843194
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004325 | Grad Max: 0.182558
  -> Layer: shared_layers.0.bias | Grad Mean: 0.437227 | Grad Max: 2.230555
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.005610
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000717 | Grad Max: 0.000717
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002764 | Grad Max: 0.488119
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051504 | Grad Max: 2.716687
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000329 | Grad Max: 0.010673
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026068 | Grad Max: 0.146794
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000401
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005239 | Grad Max: 0.010774
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000221
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001353 | Grad Max: 0.003306
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000837 | Grad Max: 0.002336
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022814 | Grad Max: 0.022814
[GRADIENT NORM TOTAL] 9.6704

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.979
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50900936 0.49099064] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 712/1336 | B: 600/1256 | C: 570/1478
[LOSS Ex1] A: 0.63338 | B: 0.62364 | C: 0.62072
[LOGITS Ex2 A] Mean Abs: 2.160 | Max: 6.276
[LOSS Ex2] A: 0.11742 | B: 0.31405 | C: 0.24510
** [JOINT LOSS] ** : 0.851435
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003642 | Grad Max: 0.123614
  -> Layer: shared_layers.0.bias | Grad Mean: 0.203501 | Grad Max: 0.987507
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002140 | Grad Max: 0.006332
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001091 | Grad Max: 0.001091
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001319 | Grad Max: 0.517117
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023298 | Grad Max: 2.898510
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.004717
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009680 | Grad Max: 0.053948
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000225
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001856 | Grad Max: 0.005071
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000119
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000454 | Grad Max: 0.001490
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000332 | Grad Max: 0.001169
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007313 | Grad Max: 0.007313
[GRADIENT NORM TOTAL] 5.6417

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.009
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50635856 0.49364147] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 717/1331 | B: 650/1398 | C: 536/1512
[LOSS Ex1] A: 0.63237 | B: 0.62298 | C: 0.62280
[LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.619
[LOSS Ex2] A: 0.12195 | B: 0.33814 | C: 0.23896
** [JOINT LOSS] ** : 0.859062
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003451 | Grad Max: 0.107396
  -> Layer: shared_layers.0.bias | Grad Mean: 0.266407 | Grad Max: 1.339336
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.006378
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001819 | Grad Max: 0.001819
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001794 | Grad Max: 0.214231
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031957 | Grad Max: 1.175049
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000181 | Grad Max: 0.008649
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014041 | Grad Max: 0.093654
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000273
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002589 | Grad Max: 0.006702
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000153
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000654 | Grad Max: 0.002001
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001376
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010658 | Grad Max: 0.010658
[GRADIENT NORM TOTAL] 5.8573

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.833
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.501528   0.49847195] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.557 | Std: 0.076
[MASKS] A(Pass/Fail): 685/1363 | B: 636/1412 | C: 557/1491
[LOSS Ex1] A: 0.64099 | B: 0.62357 | C: 0.61346
[LOGITS Ex2 A] Mean Abs: 2.133 | Max: 5.779
[LOSS Ex2] A: 0.11071 | B: 0.33622 | C: 0.24705
** [JOINT LOSS] ** : 0.857334
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003420 | Grad Max: 0.091146
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153197 | Grad Max: 0.732175
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.006231
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009761 | Grad Max: 0.009761
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001027 | Grad Max: 0.456042
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018263 | Grad Max: 2.540079
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.004498
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005767 | Grad Max: 0.046473
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000189
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001102 | Grad Max: 0.004034
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000078
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000256 | Grad Max: 0.000962
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000343 | Grad Max: 0.000942
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003066 | Grad Max: 0.003066
[GRADIENT NORM TOTAL] 4.3899

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.750
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54389685 0.45610312] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.556 | Std: 0.073
[MASKS] A(Pass/Fail): 685/1363 | B: 639/1409 | C: 578/1470
[LOSS Ex1] A: 0.64129 | B: 0.61917 | C: 0.61488
[LOGITS Ex2 A] Mean Abs: 2.125 | Max: 5.793
[LOSS Ex2] A: 0.12070 | B: 0.30935 | C: 0.24238
** [JOINT LOSS] ** : 0.849261
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002343 | Grad Max: 0.075720
  -> Layer: shared_layers.0.bias | Grad Mean: 0.211473 | Grad Max: 1.004155
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.005899
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004036 | Grad Max: 0.004036
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.376713
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026425 | Grad Max: 2.105936
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.007041
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011207 | Grad Max: 0.079598
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000221
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002206 | Grad Max: 0.005437
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000109
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000562 | Grad Max: 0.001479
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000384 | Grad Max: 0.001326
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008802 | Grad Max: 0.008802
[GRADIENT NORM TOTAL] 5.2502

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.938
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.77792966 0.2220703 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.077
[MASKS] A(Pass/Fail): 746/1302 | B: 600/1256 | C: 360/1016
[LOSS Ex1] A: 0.63506 | B: 0.62349 | C: 0.61948
[LOGITS Ex2 A] Mean Abs: 2.129 | Max: 6.217
[LOSS Ex2] A: 0.10616 | B: 0.31738 | C: 0.21075
** [JOINT LOSS] ** : 0.837442
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003705 | Grad Max: 0.093126
  -> Layer: shared_layers.0.bias | Grad Mean: 0.218534 | Grad Max: 1.038931
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.005860
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004240 | Grad Max: 0.004240
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001428 | Grad Max: 0.169764
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026284 | Grad Max: 0.948099
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000181 | Grad Max: 0.007595
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014114 | Grad Max: 0.070695
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000286
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002862 | Grad Max: 0.007114
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000183
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000720 | Grad Max: 0.002533
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.002107
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011973 | Grad Max: 0.011973
[GRADIENT NORM TOTAL] 4.4398

[EPOCH SUMMARY] Train Loss: 0.8525

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8344 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8365 -> New: 0.8344)

############################## EPOCH 139/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.040
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004846  0.49951538] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.076
[MASKS] A(Pass/Fail): 718/1330 | B: 650/1398 | C: 556/1492
[LOSS Ex1] A: 0.64193 | B: 0.62283 | C: 0.62045
[LOGITS Ex2 A] Mean Abs: 2.149 | Max: 6.082
[LOSS Ex2] A: 0.10492 | B: 0.34044 | C: 0.22908
** [JOINT LOSS] ** : 0.853215
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004031 | Grad Max: 0.143488
  -> Layer: shared_layers.0.bias | Grad Mean: 0.142023 | Grad Max: 1.129064
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005737
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000101 | Grad Max: 0.000101
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001000 | Grad Max: 0.217077
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016705 | Grad Max: 1.204916
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.003466
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003132 | Grad Max: 0.028526
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000142
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000479 | Grad Max: 0.002529
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000723
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000400 | Grad Max: 0.001018
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001894 | Grad Max: 0.001894
[GRADIENT NORM TOTAL] 3.7371

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.732
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7151232  0.28487676] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.077
[MASKS] A(Pass/Fail): 714/1334 | B: 636/1412 | C: 533/1515
[LOSS Ex1] A: 0.63750 | B: 0.62342 | C: 0.62124
[LOGITS Ex2 A] Mean Abs: 2.146 | Max: 6.342
[LOSS Ex2] A: 0.12205 | B: 0.33360 | C: 0.23788
** [JOINT LOSS] ** : 0.858561
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004245 | Grad Max: 0.125003
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134751 | Grad Max: 0.827644
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005745
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001920 | Grad Max: 0.001920
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001131 | Grad Max: 0.303935
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018971 | Grad Max: 1.700392
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.004403
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005038 | Grad Max: 0.042656
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000152
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000848 | Grad Max: 0.003805
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000200 | Grad Max: 0.000779
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.000875
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002517 | Grad Max: 0.002517
[GRADIENT NORM TOTAL] 3.8628

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.846
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6295932  0.37040678] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 589/1027 | B: 639/1409 | C: 584/1464
[LOSS Ex1] A: 0.63570 | B: 0.61902 | C: 0.61320
[LOGITS Ex2 A] Mean Abs: 2.185 | Max: 7.784
[LOSS Ex2] A: 0.11369 | B: 0.32192 | C: 0.22582
** [JOINT LOSS] ** : 0.843115
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003821 | Grad Max: 0.120671
  -> Layer: shared_layers.0.bias | Grad Mean: 0.190586 | Grad Max: 0.852520
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.006623
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009978 | Grad Max: 0.009978
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001054 | Grad Max: 0.574919
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018910 | Grad Max: 3.199260
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003911
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005162 | Grad Max: 0.043527
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000188
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001026 | Grad Max: 0.004116
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000095
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000253 | Grad Max: 0.000948
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000412 | Grad Max: 0.001320
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004440 | Grad Max: 0.004440
[GRADIENT NORM TOTAL] 5.6119

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.042
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.507667 0.492333] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 718/1330 | B: 600/1256 | C: 542/1506
[LOSS Ex1] A: 0.63636 | B: 0.62333 | C: 0.61893
[LOGITS Ex2 A] Mean Abs: 2.173 | Max: 7.364
[LOSS Ex2] A: 0.10969 | B: 0.30902 | C: 0.25128
** [JOINT LOSS] ** : 0.849536
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003925 | Grad Max: 0.138538
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182485 | Grad Max: 0.840974
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.005696
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004192 | Grad Max: 0.004192
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001137 | Grad Max: 0.440160
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018784 | Grad Max: 2.457703
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.005234
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003942 | Grad Max: 0.045788
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000126
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000517 | Grad Max: 0.002372
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000059
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000141 | Grad Max: 0.000682
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000309 | Grad Max: 0.000943
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002480 | Grad Max: 0.002480
[GRADIENT NORM TOTAL] 5.0246

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.984
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50895447 0.4910455 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 713/1335 | B: 650/1398 | C: 548/1500
[LOSS Ex1] A: 0.63315 | B: 0.62267 | C: 0.61976
[LOGITS Ex2 A] Mean Abs: 2.197 | Max: 6.273
[LOSS Ex2] A: 0.11763 | B: 0.33621 | C: 0.24114
** [JOINT LOSS] ** : 0.856853
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006166 | Grad Max: 0.234545
  -> Layer: shared_layers.0.bias | Grad Mean: 0.111724 | Grad Max: 0.877695
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006201
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000730 | Grad Max: 0.000730
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001010 | Grad Max: 0.170496
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016354 | Grad Max: 0.939621
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.003527
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005185 | Grad Max: 0.032661
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000278
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001294 | Grad Max: 0.003964
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000129
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000311 | Grad Max: 0.001100
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000294 | Grad Max: 0.001057
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004520 | Grad Max: 0.004520
[GRADIENT NORM TOTAL] 3.0691

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.014
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.506485   0.49351504] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 717/1331 | B: 636/1412 | C: 566/1482
[LOSS Ex1] A: 0.63214 | B: 0.62325 | C: 0.62014
[LOGITS Ex2 A] Mean Abs: 2.157 | Max: 7.295
[LOSS Ex2] A: 0.12709 | B: 0.33031 | C: 0.23602
** [JOINT LOSS] ** : 0.856311
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004235 | Grad Max: 0.134149
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125570 | Grad Max: 0.740954
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.006527
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000194 | Grad Max: 0.000194
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000974 | Grad Max: 0.215637
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016034 | Grad Max: 1.212048
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.002544
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002336 | Grad Max: 0.023016
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000189
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000329 | Grad Max: 0.002429
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000090 | Grad Max: 0.000567
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000339 | Grad Max: 0.000950
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000527 | Grad Max: 0.000527
[GRADIENT NORM TOTAL] 3.3813

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.838
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015316 0.4984684] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.076
[MASKS] A(Pass/Fail): 687/1361 | B: 639/1409 | C: 590/1458
[LOSS Ex1] A: 0.64076 | B: 0.61884 | C: 0.61376
[LOGITS Ex2 A] Mean Abs: 2.142 | Max: 6.558
[LOSS Ex2] A: 0.11201 | B: 0.31296 | C: 0.22204
** [JOINT LOSS] ** : 0.840123
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004197 | Grad Max: 0.206322
  -> Layer: shared_layers.0.bias | Grad Mean: 0.084214 | Grad Max: 0.350190
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.005896
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007876 | Grad Max: 0.007876
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001047 | Grad Max: 0.146818
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017053 | Grad Max: 0.818219
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000097 | Grad Max: 0.003480
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006979 | Grad Max: 0.034961
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000224
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001664 | Grad Max: 0.004710
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000407 | Grad Max: 0.001406
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000473 | Grad Max: 0.001640
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006646 | Grad Max: 0.006646
[GRADIENT NORM TOTAL] 2.8015

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.754
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54387975 0.45612025] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.073
[MASKS] A(Pass/Fail): 685/1363 | B: 600/1256 | C: 569/1479
[LOSS Ex1] A: 0.64107 | B: 0.62314 | C: 0.61406
[LOGITS Ex2 A] Mean Abs: 2.158 | Max: 5.935
[LOSS Ex2] A: 0.12351 | B: 0.31500 | C: 0.21692
** [JOINT LOSS] ** : 0.844566
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004034 | Grad Max: 0.124453
  -> Layer: shared_layers.0.bias | Grad Mean: 0.214177 | Grad Max: 1.068504
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005810
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000308 | Grad Max: 0.000308
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001618 | Grad Max: 0.431269
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028355 | Grad Max: 2.415372
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000161 | Grad Max: 0.006791
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012317 | Grad Max: 0.085344
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000217
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002211 | Grad Max: 0.005690
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000116
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000583 | Grad Max: 0.001682
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000435 | Grad Max: 0.001398
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010063 | Grad Max: 0.010063
[GRADIENT NORM TOTAL] 5.5701

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.943
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.77918535 0.22081466] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.077
[MASKS] A(Pass/Fail): 746/1302 | B: 650/1398 | C: 602/1446
[LOSS Ex1] A: 0.63482 | B: 0.62248 | C: 0.61417
[LOGITS Ex2 A] Mean Abs: 2.190 | Max: 7.447
[LOSS Ex2] A: 0.10840 | B: 0.32315 | C: 0.22173
** [JOINT LOSS] ** : 0.841582
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003602 | Grad Max: 0.117547
  -> Layer: shared_layers.0.bias | Grad Mean: 0.112271 | Grad Max: 0.612127
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002258 | Grad Max: 0.006461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002176 | Grad Max: 0.002176
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000870 | Grad Max: 0.331717
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014135 | Grad Max: 1.859245
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002867
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002248 | Grad Max: 0.027670
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000162
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000355 | Grad Max: 0.002524
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000089 | Grad Max: 0.000549
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000407 | Grad Max: 0.001021
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000728 | Grad Max: 0.000728
[GRADIENT NORM TOTAL] 3.3935

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.046
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004692  0.49953073] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.076
[MASKS] A(Pass/Fail): 718/1330 | B: 636/1412 | C: 562/1486
[LOSS Ex1] A: 0.64169 | B: 0.62305 | C: 0.62071
[LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.177
[LOSS Ex2] A: 0.10550 | B: 0.33012 | C: 0.23295
** [JOINT LOSS] ** : 0.851343
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003489 | Grad Max: 0.126152
  -> Layer: shared_layers.0.bias | Grad Mean: 0.098808 | Grad Max: 0.622250
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.005521
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000637 | Grad Max: 0.000637
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000750 | Grad Max: 0.171783
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012183 | Grad Max: 0.963763
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.003939
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003215 | Grad Max: 0.030988
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000701 | Grad Max: 0.003059
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000171 | Grad Max: 0.000853
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000357 | Grad Max: 0.001424
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004047 | Grad Max: 0.004047
[GRADIENT NORM TOTAL] 2.5696

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.737
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7159942  0.28400576] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.077
[MASKS] A(Pass/Fail): 714/1334 | B: 639/1409 | C: 566/1482
[LOSS Ex1] A: 0.63724 | B: 0.61863 | C: 0.61384
[LOGITS Ex2 A] Mean Abs: 2.190 | Max: 6.254
[LOSS Ex2] A: 0.12624 | B: 0.31470 | C: 0.22872
** [JOINT LOSS] ** : 0.846453
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002918 | Grad Max: 0.093407
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116116 | Grad Max: 0.549032
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.005990
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000367 | Grad Max: 0.000367
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000956 | Grad Max: 0.438946
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016518 | Grad Max: 2.449485
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.006042
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005623 | Grad Max: 0.065622
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000157
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001244 | Grad Max: 0.004208
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000096
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000331 | Grad Max: 0.001037
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000350 | Grad Max: 0.001077
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005041 | Grad Max: 0.005041
[GRADIENT NORM TOTAL] 4.0476

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.851
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.62994856 0.37005144] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.078
[MASKS] A(Pass/Fail): 589/1027 | B: 600/1256 | C: 554/1494
[LOSS Ex1] A: 0.63543 | B: 0.62293 | C: 0.61789
[LOGITS Ex2 A] Mean Abs: 2.222 | Max: 8.235
[LOSS Ex2] A: 0.11773 | B: 0.31721 | C: 0.24296
** [JOINT LOSS] ** : 0.851383
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003952 | Grad Max: 0.104058
  -> Layer: shared_layers.0.bias | Grad Mean: 0.168998 | Grad Max: 0.767948
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.006194
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010402 | Grad Max: 0.010402
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001215 | Grad Max: 0.512284
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021062 | Grad Max: 2.844548
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.005184
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007813 | Grad Max: 0.057298
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000195
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001300 | Grad Max: 0.004061
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000327 | Grad Max: 0.001131
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000263 | Grad Max: 0.001056
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006088 | Grad Max: 0.006088
[GRADIENT NORM TOTAL] 4.7346

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.049
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50768834 0.49231163] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 717/1331 | B: 650/1398 | C: 593/1455
[LOSS Ex1] A: 0.63609 | B: 0.62226 | C: 0.60985
[LOGITS Ex2 A] Mean Abs: 2.209 | Max: 7.571
[LOSS Ex2] A: 0.11146 | B: 0.33009 | C: 0.23878
** [JOINT LOSS] ** : 0.849507
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005933 | Grad Max: 0.268127
  -> Layer: shared_layers.0.bias | Grad Mean: 0.150234 | Grad Max: 1.285709
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006195
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002741 | Grad Max: 0.002741
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001262 | Grad Max: 0.162715
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020891 | Grad Max: 0.897498
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.005290
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007870 | Grad Max: 0.053138
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000279
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001851 | Grad Max: 0.005007
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000122
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000451 | Grad Max: 0.001471
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000322 | Grad Max: 0.001225
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006622 | Grad Max: 0.006622
[GRADIENT NORM TOTAL] 3.7747

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.990
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5088788  0.49112117] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 713/1335 | B: 637/1411 | C: 324/1052
[LOSS Ex1] A: 0.63287 | B: 0.62284 | C: 0.62856
[LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.352
[LOSS Ex2] A: 0.11230 | B: 0.33043 | C: 0.23014
** [JOINT LOSS] ** : 0.852378
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002883 | Grad Max: 0.094970
  -> Layer: shared_layers.0.bias | Grad Mean: 0.071526 | Grad Max: 0.332434
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005989
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001177 | Grad Max: 0.001177
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000813 | Grad Max: 0.149971
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013582 | Grad Max: 0.823411
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.003928
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003853 | Grad Max: 0.036735
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000138
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000579 | Grad Max: 0.002913
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000165 | Grad Max: 0.000808
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.001150
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003642 | Grad Max: 0.003642
[GRADIENT NORM TOTAL] 2.3384

[EPOCH SUMMARY] Train Loss: 0.8496

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8312 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8344 -> New: 0.8312)

############################## EPOCH 140/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.021
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5065903  0.49340966] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.077
[MASKS] A(Pass/Fail): 717/1331 | B: 639/1409 | C: 549/1499
[LOSS Ex1] A: 0.63185 | B: 0.61841 | C: 0.62371
[LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.199
[LOSS Ex2] A: 0.12436 | B: 0.31335 | C: 0.24588
** [JOINT LOSS] ** : 0.852522
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002557 | Grad Max: 0.081023
  -> Layer: shared_layers.0.bias | Grad Mean: 0.136952 | Grad Max: 0.933232
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.005628
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001269 | Grad Max: 0.001269
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001033 | Grad Max: 0.319262
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018289 | Grad Max: 1.789287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.003413
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004780 | Grad Max: 0.032133
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000180
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000909 | Grad Max: 0.003650
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000188 | Grad Max: 0.000934
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000236 | Grad Max: 0.000745
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001328 | Grad Max: 0.001328
[GRADIENT NORM TOTAL] 4.0768

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.844
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50149477 0.4985053 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.076
[MASKS] A(Pass/Fail): 689/1359 | B: 600/1256 | C: 581/1467
[LOSS Ex1] A: 0.64049 | B: 0.62271 | C: 0.61743
[LOGITS Ex2 A] Mean Abs: 2.182 | Max: 7.962
[LOSS Ex2] A: 0.11912 | B: 0.31035 | C: 0.22272
** [JOINT LOSS] ** : 0.844273
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007425 | Grad Max: 0.308850
  -> Layer: shared_layers.0.bias | Grad Mean: 0.329574 | Grad Max: 1.441409
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.006352
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003586 | Grad Max: 0.003586
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.522059
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042899 | Grad Max: 2.934866
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.009618
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022261 | Grad Max: 0.116798
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000439
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004637 | Grad Max: 0.010082
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000257
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001131 | Grad Max: 0.003273
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000810 | Grad Max: 0.002433
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018449 | Grad Max: 0.018449
[GRADIENT NORM TOTAL] 7.7924

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.759
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54389066 0.4561094 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.074
[MASKS] A(Pass/Fail): 685/1363 | B: 650/1398 | C: 555/1493
[LOSS Ex1] A: 0.64082 | B: 0.62205 | C: 0.61862
[LOGITS Ex2 A] Mean Abs: 2.217 | Max: 6.138
[LOSS Ex2] A: 0.12243 | B: 0.34028 | C: 0.21690
** [JOINT LOSS] ** : 0.853698
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007355 | Grad Max: 0.340115
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191857 | Grad Max: 0.734088
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005848
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005498 | Grad Max: 0.005498
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001657 | Grad Max: 0.333595
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025889 | Grad Max: 1.831378
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.008504
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005879 | Grad Max: 0.091006
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000589 | Grad Max: 0.003144
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000159 | Grad Max: 0.000730
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000265 | Grad Max: 0.000739
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002610 | Grad Max: 0.002610
[GRADIENT NORM TOTAL] 4.9227

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.950
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.78066885 0.21933112] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.078
[MASKS] A(Pass/Fail): 747/1301 | B: 638/1410 | C: 537/1511
[LOSS Ex1] A: 0.63455 | B: 0.62262 | C: 0.61726
[LOGITS Ex2 A] Mean Abs: 2.269 | Max: 6.548
[LOSS Ex2] A: 0.11223 | B: 0.34179 | C: 0.23740
** [JOINT LOSS] ** : 0.855283
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004762 | Grad Max: 0.147816
  -> Layer: shared_layers.0.bias | Grad Mean: 0.376326 | Grad Max: 1.958760
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.005849
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002718 | Grad Max: 0.002718
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002542 | Grad Max: 0.486189
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046394 | Grad Max: 2.661593
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.012278
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022979 | Grad Max: 0.149557
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000406
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004237 | Grad Max: 0.009814
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000206
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001016 | Grad Max: 0.002945
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000570 | Grad Max: 0.001508
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015067 | Grad Max: 0.015067
[GRADIENT NORM TOTAL] 8.6724

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.054
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50047594 0.49952406] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 719/1329 | B: 639/1409 | C: 590/1458
[LOSS Ex1] A: 0.64144 | B: 0.61819 | C: 0.61213
[LOGITS Ex2 A] Mean Abs: 2.254 | Max: 5.960
[LOSS Ex2] A: 0.10961 | B: 0.31058 | C: 0.23854
** [JOINT LOSS] ** : 0.843498
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004657 | Grad Max: 0.162262
  -> Layer: shared_layers.0.bias | Grad Mean: 0.163264 | Grad Max: 0.772581
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002151 | Grad Max: 0.006117
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001233 | Grad Max: 0.001233
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001124 | Grad Max: 0.529504
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018161 | Grad Max: 2.933545
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.002431
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002860 | Grad Max: 0.022263
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000186
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000570 | Grad Max: 0.002912
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000694
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.000933
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002006 | Grad Max: 0.002006
[GRADIENT NORM TOTAL] 5.3078

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.742
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7170845 0.2829155] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.078
[MASKS] A(Pass/Fail): 714/1334 | B: 602/1254 | C: 598/1450
[LOSS Ex1] A: 0.63697 | B: 0.62250 | C: 0.61476
[LOGITS Ex2 A] Mean Abs: 2.214 | Max: 6.455
[LOSS Ex2] A: 0.12937 | B: 0.32338 | C: 0.22349
** [JOINT LOSS] ** : 0.850158
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003974 | Grad Max: 0.127598
  -> Layer: shared_layers.0.bias | Grad Mean: 0.388217 | Grad Max: 1.835605
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006125
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001295 | Grad Max: 0.001295
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002422 | Grad Max: 0.605514
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044039 | Grad Max: 3.387959
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.009023
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021654 | Grad Max: 0.110027
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000343
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004342 | Grad Max: 0.009834
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000232
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001064 | Grad Max: 0.003170
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000635 | Grad Max: 0.001762
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016656 | Grad Max: 0.016656
[GRADIENT NORM TOTAL] 8.6722

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.857
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6305006  0.36949936] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.078
[MASKS] A(Pass/Fail): 589/1027 | B: 650/1398 | C: 558/1490
[LOSS Ex1] A: 0.63517 | B: 0.62185 | C: 0.61701
[LOGITS Ex2 A] Mean Abs: 2.276 | Max: 10.290
[LOSS Ex2] A: 0.11885 | B: 0.33699 | C: 0.24143
** [JOINT LOSS] ** : 0.857102
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004764 | Grad Max: 0.135540
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149681 | Grad Max: 0.765896
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005956
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002787 | Grad Max: 0.002787
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001101 | Grad Max: 0.151988
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018435 | Grad Max: 0.835920
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000064 | Grad Max: 0.004143
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003206 | Grad Max: 0.048397
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000152
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000464 | Grad Max: 0.002437
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000071
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000133 | Grad Max: 0.000671
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000180 | Grad Max: 0.000608
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003281 | Grad Max: 0.003281
[GRADIENT NORM TOTAL] 3.2094

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.055
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.507633   0.49236706] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.078
[MASKS] A(Pass/Fail): 718/1330 | B: 638/1410 | C: 572/1476
[LOSS Ex1] A: 0.63583 | B: 0.62242 | C: 0.61337
[LOGITS Ex2 A] Mean Abs: 2.279 | Max: 7.833
[LOSS Ex2] A: 0.10507 | B: 0.33251 | C: 0.21231
** [JOINT LOSS] ** : 0.840503
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007637 | Grad Max: 0.247156
  -> Layer: shared_layers.0.bias | Grad Mean: 0.413837 | Grad Max: 2.193250
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.005872
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001592 | Grad Max: 0.001592
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002823 | Grad Max: 0.437951
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051527 | Grad Max: 2.434974
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000329 | Grad Max: 0.011030
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025522 | Grad Max: 0.135162
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000457
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005277 | Grad Max: 0.011653
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000250
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001301 | Grad Max: 0.003416
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000830 | Grad Max: 0.002193
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020436 | Grad Max: 0.020436
[GRADIENT NORM TOTAL] 9.2171

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.996
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.508936   0.49106395] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 713/1335 | B: 639/1409 | C: 556/1492
[LOSS Ex1] A: 0.63260 | B: 0.61800 | C: 0.61405
[LOGITS Ex2 A] Mean Abs: 2.244 | Max: 6.921
[LOSS Ex2] A: 0.11842 | B: 0.30909 | C: 0.22818
** [JOINT LOSS] ** : 0.840114
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006240 | Grad Max: 0.191194
  -> Layer: shared_layers.0.bias | Grad Mean: 0.298517 | Grad Max: 1.778480
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002349 | Grad Max: 0.005851
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002061 | Grad Max: 0.002061
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.339120
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037026 | Grad Max: 1.894032
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.007368
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017139 | Grad Max: 0.100450
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000343
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003600 | Grad Max: 0.007821
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000188
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000892 | Grad Max: 0.002556
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001615
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014095 | Grad Max: 0.014095
[GRADIENT NORM TOTAL] 6.9127

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.027
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50661    0.49339002] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 717/1331 | B: 602/1254 | C: 517/1531
[LOSS Ex1] A: 0.63159 | B: 0.62230 | C: 0.62133
[LOGITS Ex2 A] Mean Abs: 2.189 | Max: 7.298
[LOSS Ex2] A: 0.12330 | B: 0.33132 | C: 0.23693
** [JOINT LOSS] ** : 0.855591
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004235 | Grad Max: 0.135675
  -> Layer: shared_layers.0.bias | Grad Mean: 0.370391 | Grad Max: 2.068271
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002295 | Grad Max: 0.006791
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008732 | Grad Max: 0.008732
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.503122
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044481 | Grad Max: 2.809725
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.010489
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024371 | Grad Max: 0.139562
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000381
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004543 | Grad Max: 0.009598
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000214
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001084 | Grad Max: 0.003023
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000667 | Grad Max: 0.001718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017072 | Grad Max: 0.017072
[GRADIENT NORM TOTAL] 8.3702

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.848
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50139356 0.49860647] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.077
[MASKS] A(Pass/Fail): 689/1359 | B: 650/1398 | C: 551/1497
[LOSS Ex1] A: 0.64026 | B: 0.62166 | C: 0.61863
[LOGITS Ex2 A] Mean Abs: 2.139 | Max: 6.503
[LOSS Ex2] A: 0.10567 | B: 0.34140 | C: 0.26369
** [JOINT LOSS] ** : 0.863774
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007571 | Grad Max: 0.176825
  -> Layer: shared_layers.0.bias | Grad Mean: 0.418401 | Grad Max: 2.074460
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.006120
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009362 | Grad Max: 0.009362
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002934 | Grad Max: 0.624430
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054114 | Grad Max: 3.519325
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000372 | Grad Max: 0.012522
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029494 | Grad Max: 0.174911
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000467
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006054 | Grad Max: 0.012255
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000247
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001557 | Grad Max: 0.003579
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001118 | Grad Max: 0.002526
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027746 | Grad Max: 0.027746
[GRADIENT NORM TOTAL] 9.4885

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.764
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54400074 0.45599923] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.074
[MASKS] A(Pass/Fail): 686/1362 | B: 638/1410 | C: 569/1479
[LOSS Ex1] A: 0.64060 | B: 0.62224 | C: 0.61598
[LOGITS Ex2 A] Mean Abs: 2.165 | Max: 6.582
[LOSS Ex2] A: 0.11798 | B: 0.33816 | C: 0.24011
** [JOINT LOSS] ** : 0.858357
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005100 | Grad Max: 0.127134
  -> Layer: shared_layers.0.bias | Grad Mean: 0.231408 | Grad Max: 1.278394
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.006428
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011655 | Grad Max: 0.011655
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001470 | Grad Max: 0.435101
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025580 | Grad Max: 2.404567
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.004405
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003630 | Grad Max: 0.037924
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000127
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000538 | Grad Max: 0.002817
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.000857
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000287 | Grad Max: 0.001151
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000052 | Grad Max: 0.000052
[GRADIENT NORM TOTAL] 5.6420

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.955
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.78205    0.21794997] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.078
[MASKS] A(Pass/Fail): 749/1299 | B: 639/1409 | C: 584/1464
[LOSS Ex1] A: 0.63431 | B: 0.61783 | C: 0.61129
[LOGITS Ex2 A] Mean Abs: 2.225 | Max: 7.405
[LOSS Ex2] A: 0.11355 | B: 0.32480 | C: 0.25043
** [JOINT LOSS] ** : 0.850733
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003857 | Grad Max: 0.255086
  -> Layer: shared_layers.0.bias | Grad Mean: 0.643389 | Grad Max: 3.158346
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002364 | Grad Max: 0.006040
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006500 | Grad Max: 0.006500
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004084 | Grad Max: 0.787372
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.075941 | Grad Max: 4.392558
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000456 | Grad Max: 0.018411
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037674 | Grad Max: 0.235381
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000544
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007328 | Grad Max: 0.015448
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000311
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001800 | Grad Max: 0.004874
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000983 | Grad Max: 0.002244
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028225 | Grad Max: 0.028225
[GRADIENT NORM TOTAL] 15.0116

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.059
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500508 0.499492] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 720/1328 | B: 602/1254 | C: 384/992
[LOSS Ex1] A: 0.64123 | B: 0.62214 | C: 0.61789
[LOGITS Ex2 A] Mean Abs: 2.220 | Max: 5.914
[LOSS Ex2] A: 0.10563 | B: 0.30831 | C: 0.25821
** [JOINT LOSS] ** : 0.851134
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003774 | Grad Max: 0.107573
  -> Layer: shared_layers.0.bias | Grad Mean: 0.358963 | Grad Max: 1.465695
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005599
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001563 | Grad Max: 0.001563
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002515 | Grad Max: 0.417776
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046133 | Grad Max: 2.321572
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000286 | Grad Max: 0.010567
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023099 | Grad Max: 0.128929
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004304 | Grad Max: 0.009352
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000173
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001032 | Grad Max: 0.002567
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000576 | Grad Max: 0.001737
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015235 | Grad Max: 0.015235
[GRADIENT NORM TOTAL] 8.4015

[EPOCH SUMMARY] Train Loss: 0.8512

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8325 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 141/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.746
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71806955 0.28193048] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.078
[MASKS] A(Pass/Fail): 714/1334 | B: 650/1398 | C: 556/1492
[LOSS Ex1] A: 0.63675 | B: 0.62151 | C: 0.62048
[LOGITS Ex2 A] Mean Abs: 2.168 | Max: 7.404
[LOSS Ex2] A: 0.13296 | B: 0.33860 | C: 0.23933
** [JOINT LOSS] ** : 0.863211
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006865 | Grad Max: 0.165085
  -> Layer: shared_layers.0.bias | Grad Mean: 0.422659 | Grad Max: 2.043360
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.005883
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008484 | Grad Max: 0.008484
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002571 | Grad Max: 0.391884
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047330 | Grad Max: 2.057196
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000307 | Grad Max: 0.010217
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024227 | Grad Max: 0.136355
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000468
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004914 | Grad Max: 0.011162
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000231
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001246 | Grad Max: 0.003449
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000852 | Grad Max: 0.001976
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021473 | Grad Max: 0.021473
[GRADIENT NORM TOTAL] 8.5403

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.862
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6309649  0.36903512] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.078
[MASKS] A(Pass/Fail): 589/1027 | B: 638/1410 | C: 534/1514
[LOSS Ex1] A: 0.63495 | B: 0.62210 | C: 0.62380
[LOGITS Ex2 A] Mean Abs: 2.210 | Max: 7.140
[LOSS Ex2] A: 0.10865 | B: 0.33398 | C: 0.23800
** [JOINT LOSS] ** : 0.853824
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006106 | Grad Max: 0.210175
  -> Layer: shared_layers.0.bias | Grad Mean: 0.543747 | Grad Max: 2.841476
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.006046
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004787 | Grad Max: 0.004787
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003296 | Grad Max: 0.789017
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061478 | Grad Max: 4.366045
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.014452
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029746 | Grad Max: 0.192105
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000543
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005904 | Grad Max: 0.013508
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000269
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001491 | Grad Max: 0.003896
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001008 | Grad Max: 0.002476
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025858 | Grad Max: 0.025858
[GRADIENT NORM TOTAL] 12.6216

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.061
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50765634 0.4923436 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 718/1330 | B: 639/1409 | C: 579/1469
[LOSS Ex1] A: 0.63562 | B: 0.61769 | C: 0.61272
[LOGITS Ex2 A] Mean Abs: 2.225 | Max: 7.728
[LOSS Ex2] A: 0.10557 | B: 0.31006 | C: 0.22311
** [JOINT LOSS] ** : 0.834920
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002296 | Grad Max: 0.058193
  -> Layer: shared_layers.0.bias | Grad Mean: 0.145778 | Grad Max: 0.649364
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002278 | Grad Max: 0.005523
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003038 | Grad Max: 0.003038
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000879 | Grad Max: 0.531998
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015332 | Grad Max: 2.950930
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002626
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002311 | Grad Max: 0.018051
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000156
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000388 | Grad Max: 0.002882
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000068
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000773
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.001182
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000577 | Grad Max: 0.000577
[GRADIENT NORM TOTAL] 5.0357

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.001
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5088294 0.4911706] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 713/1335 | B: 602/1254 | C: 566/1482
[LOSS Ex1] A: 0.63238 | B: 0.62200 | C: 0.61461
[LOGITS Ex2 A] Mean Abs: 2.242 | Max: 6.243
[LOSS Ex2] A: 0.11587 | B: 0.31362 | C: 0.23751
** [JOINT LOSS] ** : 0.845329
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006509 | Grad Max: 0.175861
  -> Layer: shared_layers.0.bias | Grad Mean: 0.429425 | Grad Max: 2.069848
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002250 | Grad Max: 0.006205
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001784 | Grad Max: 0.001784
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002889 | Grad Max: 0.474126
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052524 | Grad Max: 2.664815
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.014731
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027788 | Grad Max: 0.152766
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000419
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005681 | Grad Max: 0.011556
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000233
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001452 | Grad Max: 0.003692
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000902 | Grad Max: 0.002170
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024165 | Grad Max: 0.024165
[GRADIENT NORM TOTAL] 9.3701

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.032
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067053  0.49329472] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 717/1331 | B: 650/1398 | C: 558/1490
[LOSS Ex1] A: 0.63138 | B: 0.62138 | C: 0.61338
[LOGITS Ex2 A] Mean Abs: 2.216 | Max: 6.944
[LOSS Ex2] A: 0.12910 | B: 0.32976 | C: 0.23933
** [JOINT LOSS] ** : 0.854778
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007923 | Grad Max: 0.246288
  -> Layer: shared_layers.0.bias | Grad Mean: 0.360676 | Grad Max: 1.665068
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.006130
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006900 | Grad Max: 0.006900
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002587 | Grad Max: 0.480935
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046556 | Grad Max: 2.687396
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000329 | Grad Max: 0.012096
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025674 | Grad Max: 0.126939
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000453
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005365 | Grad Max: 0.011309
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000270
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001309 | Grad Max: 0.003931
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000742 | Grad Max: 0.002091
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019436 | Grad Max: 0.019436
[GRADIENT NORM TOTAL] 8.0701

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.852
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50141203 0.49858797] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.077
[MASKS] A(Pass/Fail): 689/1359 | B: 638/1410 | C: 535/1513
[LOSS Ex1] A: 0.64006 | B: 0.62197 | C: 0.61956
[LOGITS Ex2 A] Mean Abs: 2.137 | Max: 6.747
[LOSS Ex2] A: 0.10880 | B: 0.33890 | C: 0.24637
** [JOINT LOSS] ** : 0.858554
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003791 | Grad Max: 0.179492
  -> Layer: shared_layers.0.bias | Grad Mean: 0.462530 | Grad Max: 2.464746
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002030 | Grad Max: 0.005616
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006878 | Grad Max: 0.006878
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003084 | Grad Max: 0.618228
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057700 | Grad Max: 3.477134
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000362 | Grad Max: 0.014352
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029584 | Grad Max: 0.160496
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000478
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005813 | Grad Max: 0.012739
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001424 | Grad Max: 0.003831
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000897 | Grad Max: 0.002111
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023576 | Grad Max: 0.023576
[GRADIENT NORM TOTAL] 10.7159

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.767
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438761  0.45612392] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.074
[MASKS] A(Pass/Fail): 686/1362 | B: 639/1409 | C: 562/1486
[LOSS Ex1] A: 0.64041 | B: 0.61756 | C: 0.61368
[LOGITS Ex2 A] Mean Abs: 2.100 | Max: 6.623
[LOSS Ex2] A: 0.11936 | B: 0.31683 | C: 0.22766
** [JOINT LOSS] ** : 0.845170
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004313 | Grad Max: 0.167274
  -> Layer: shared_layers.0.bias | Grad Mean: 0.494722 | Grad Max: 2.357935
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.005347
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003307 | Grad Max: 0.003307
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003209 | Grad Max: 0.590102
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059836 | Grad Max: 3.304147
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.014037
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031471 | Grad Max: 0.176374
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000574
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006190 | Grad Max: 0.015227
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001500 | Grad Max: 0.004151
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000850 | Grad Max: 0.002246
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023187 | Grad Max: 0.023187
[GRADIENT NORM TOTAL] 11.0444

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.960
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.782946   0.21705402] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.078
[MASKS] A(Pass/Fail): 747/1301 | B: 602/1254 | C: 568/1480
[LOSS Ex1] A: 0.63412 | B: 0.62187 | C: 0.61537
[LOGITS Ex2 A] Mean Abs: 2.169 | Max: 6.466
[LOSS Ex2] A: 0.10306 | B: 0.31667 | C: 0.23479
** [JOINT LOSS] ** : 0.841963
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002459 | Grad Max: 0.089490
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100319 | Grad Max: 0.615362
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002295 | Grad Max: 0.006046
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009890 | Grad Max: 0.009890
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000867 | Grad Max: 0.203104
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014962 | Grad Max: 1.123085
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003092
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002910 | Grad Max: 0.021571
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000155
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000521 | Grad Max: 0.003843
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000082
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000121 | Grad Max: 0.000906
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000363 | Grad Max: 0.001164
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001485 | Grad Max: 0.001485
[GRADIENT NORM TOTAL] 3.0676

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.064
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004384  0.49956158] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 720/1328 | B: 650/1398 | C: 550/1498
[LOSS Ex1] A: 0.64105 | B: 0.62125 | C: 0.61981
[LOGITS Ex2 A] Mean Abs: 2.219 | Max: 5.949
[LOSS Ex2] A: 0.10965 | B: 0.32654 | C: 0.22761
** [JOINT LOSS] ** : 0.848642
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004757 | Grad Max: 0.236828
  -> Layer: shared_layers.0.bias | Grad Mean: 0.540930 | Grad Max: 2.712476
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005894
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004723 | Grad Max: 0.004723
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003303 | Grad Max: 0.584947
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061773 | Grad Max: 3.252486
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.014760
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031442 | Grad Max: 0.177671
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000460
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006192 | Grad Max: 0.012402
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000252
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001570 | Grad Max: 0.004029
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000906 | Grad Max: 0.002369
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025597 | Grad Max: 0.025597
[GRADIENT NORM TOTAL] 11.9135

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.750
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.718693   0.28130698] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.078
[MASKS] A(Pass/Fail): 714/1334 | B: 638/1410 | C: 577/1471
[LOSS Ex1] A: 0.63656 | B: 0.62183 | C: 0.61545
[LOGITS Ex2 A] Mean Abs: 2.205 | Max: 6.243
[LOSS Ex2] A: 0.12654 | B: 0.33337 | C: 0.22883
** [JOINT LOSS] ** : 0.854198
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003562 | Grad Max: 0.154519
  -> Layer: shared_layers.0.bias | Grad Mean: 0.458187 | Grad Max: 2.116078
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.006193
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001208 | Grad Max: 0.001208
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003025 | Grad Max: 0.507045
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055470 | Grad Max: 2.814749
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000323 | Grad Max: 0.013262
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026114 | Grad Max: 0.142583
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000438
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004974 | Grad Max: 0.010662
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000232
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001221 | Grad Max: 0.003463
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000662 | Grad Max: 0.002159
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019025 | Grad Max: 0.019025
[GRADIENT NORM TOTAL] 10.7985

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.866
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6312126  0.36878747] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.078
[MASKS] A(Pass/Fail): 590/1026 | B: 639/1409 | C: 580/1468
[LOSS Ex1] A: 0.63477 | B: 0.61742 | C: 0.61582
[LOGITS Ex2 A] Mean Abs: 2.220 | Max: 9.098
[LOSS Ex2] A: 0.10963 | B: 0.30616 | C: 0.23913
** [JOINT LOSS] ** : 0.840976
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002982 | Grad Max: 0.077644
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182223 | Grad Max: 0.822475
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002247 | Grad Max: 0.006321
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009482 | Grad Max: 0.009482
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001139 | Grad Max: 0.613079
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020449 | Grad Max: 3.409163
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.004966
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008384 | Grad Max: 0.050990
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001775 | Grad Max: 0.004951
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000126
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000448 | Grad Max: 0.001676
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000398 | Grad Max: 0.001676
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008408 | Grad Max: 0.008408
[GRADIENT NORM TOTAL] 5.3798

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.065
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076763  0.49232367] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 718/1330 | B: 602/1254 | C: 573/1475
[LOSS Ex1] A: 0.63544 | B: 0.62172 | C: 0.61562
[LOGITS Ex2 A] Mean Abs: 2.180 | Max: 9.027
[LOSS Ex2] A: 0.10307 | B: 0.31068 | C: 0.24046
** [JOINT LOSS] ** : 0.842331
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004214 | Grad Max: 0.154785
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215698 | Grad Max: 0.996888
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.006121
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001610 | Grad Max: 0.001610
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001527 | Grad Max: 0.206105
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027061 | Grad Max: 1.144174
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000171 | Grad Max: 0.006426
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013730 | Grad Max: 0.068284
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000263
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002796 | Grad Max: 0.006502
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000165
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000690 | Grad Max: 0.002296
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000437 | Grad Max: 0.001662
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010858 | Grad Max: 0.010858
[GRADIENT NORM TOTAL] 4.7817

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.005
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5087766  0.49122337] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 714/1334 | B: 650/1398 | C: 579/1469
[LOSS Ex1] A: 0.63220 | B: 0.62111 | C: 0.61155
[LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.869
[LOSS Ex2] A: 0.11405 | B: 0.33381 | C: 0.21532
** [JOINT LOSS] ** : 0.842684
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002519 | Grad Max: 0.067958
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153370 | Grad Max: 0.828845
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002233 | Grad Max: 0.006448
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000788 | Grad Max: 0.000788
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000991 | Grad Max: 0.207923
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017804 | Grad Max: 1.163873
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.004858
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007227 | Grad Max: 0.053119
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000197
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001476 | Grad Max: 0.004538
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000369 | Grad Max: 0.001174
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000385 | Grad Max: 0.001134
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005902 | Grad Max: 0.005902
[GRADIENT NORM TOTAL] 3.4136

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.036
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50676334 0.49323666] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 717/1331 | B: 638/1410 | C: 393/983
[LOSS Ex1] A: 0.63120 | B: 0.62170 | C: 0.61239
[LOGITS Ex2 A] Mean Abs: 2.174 | Max: 7.632
[LOSS Ex2] A: 0.12193 | B: 0.32388 | C: 0.21333
** [JOINT LOSS] ** : 0.841473
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003471 | Grad Max: 0.143627
  -> Layer: shared_layers.0.bias | Grad Mean: 0.133953 | Grad Max: 0.808312
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002296 | Grad Max: 0.006282
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002417 | Grad Max: 0.002417
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001256 | Grad Max: 0.274503
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022165 | Grad Max: 1.499372
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.005547
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009612 | Grad Max: 0.058210
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000228
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002045 | Grad Max: 0.005321
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000514 | Grad Max: 0.001557
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000451 | Grad Max: 0.001465
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008085 | Grad Max: 0.008085
[GRADIENT NORM TOTAL] 3.8797

[EPOCH SUMMARY] Train Loss: 0.8477

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8320 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 142/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.856
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013791  0.49862093] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.077
[MASKS] A(Pass/Fail): 689/1359 | B: 639/1409 | C: 576/1472
[LOSS Ex1] A: 0.63989 | B: 0.61728 | C: 0.61280
[LOGITS Ex2 A] Mean Abs: 2.110 | Max: 5.922
[LOSS Ex2] A: 0.11990 | B: 0.31578 | C: 0.22700
** [JOINT LOSS] ** : 0.844212
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004605 | Grad Max: 0.163125
  -> Layer: shared_layers.0.bias | Grad Mean: 0.437420 | Grad Max: 2.238805
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002253 | Grad Max: 0.006579
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010834 | Grad Max: 0.010834
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002806 | Grad Max: 0.613294
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052376 | Grad Max: 3.447569
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.011679
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026874 | Grad Max: 0.147479
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000470
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005277 | Grad Max: 0.010988
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000246
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001304 | Grad Max: 0.003897
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000820 | Grad Max: 0.002257
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021387 | Grad Max: 0.021387
[GRADIENT NORM TOTAL] 10.1440

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.771
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439282 0.4560718] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.074
[MASKS] A(Pass/Fail): 686/1362 | B: 603/1253 | C: 594/1454
[LOSS Ex1] A: 0.64024 | B: 0.62158 | C: 0.61316
[LOGITS Ex2 A] Mean Abs: 2.097 | Max: 6.391
[LOSS Ex2] A: 0.11736 | B: 0.31145 | C: 0.25850
** [JOINT LOSS] ** : 0.854097
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005276 | Grad Max: 0.146606
  -> Layer: shared_layers.0.bias | Grad Mean: 0.397915 | Grad Max: 1.930753
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005909
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007481 | Grad Max: 0.007481
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002410 | Grad Max: 0.404397
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044924 | Grad Max: 2.257474
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000300 | Grad Max: 0.011910
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024172 | Grad Max: 0.135550
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000394
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004766 | Grad Max: 0.009959
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000246
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001184 | Grad Max: 0.003546
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000768 | Grad Max: 0.002345
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019362 | Grad Max: 0.019362
[GRADIENT NORM TOTAL] 8.2140

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.963
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7839656  0.21603446] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.079
[MASKS] A(Pass/Fail): 750/1298 | B: 650/1398 | C: 549/1499
[LOSS Ex1] A: 0.63394 | B: 0.62097 | C: 0.61820
[LOGITS Ex2 A] Mean Abs: 2.189 | Max: 7.175
[LOSS Ex2] A: 0.12015 | B: 0.33299 | C: 0.23042
** [JOINT LOSS] ** : 0.852223
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004430 | Grad Max: 0.118544
  -> Layer: shared_layers.0.bias | Grad Mean: 0.261972 | Grad Max: 1.234533
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.006076
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002405 | Grad Max: 0.002405
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.243782
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035293 | Grad Max: 1.366867
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.009475
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017768 | Grad Max: 0.096934
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000321
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003686 | Grad Max: 0.007894
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000178
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000923 | Grad Max: 0.002538
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000599 | Grad Max: 0.001586
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014989 | Grad Max: 0.014989
[GRADIENT NORM TOTAL] 5.9790

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.068
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50043523 0.49956477] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.077
[MASKS] A(Pass/Fail): 720/1328 | B: 638/1410 | C: 533/1515
[LOSS Ex1] A: 0.64088 | B: 0.62156 | C: 0.61858
[LOGITS Ex2 A] Mean Abs: 2.194 | Max: 6.294
[LOSS Ex2] A: 0.10162 | B: 0.33019 | C: 0.22394
** [JOINT LOSS] ** : 0.845591
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003047 | Grad Max: 0.096265
  -> Layer: shared_layers.0.bias | Grad Mean: 0.274828 | Grad Max: 1.310618
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005182
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001005 | Grad Max: 0.001005
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.391989
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036770 | Grad Max: 2.206040
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.009127
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016534 | Grad Max: 0.112661
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000307
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003129 | Grad Max: 0.007497
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000759 | Grad Max: 0.002277
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000445 | Grad Max: 0.001478
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010597 | Grad Max: 0.010597
[GRADIENT NORM TOTAL] 6.9062

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.753
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.71949035 0.28050962] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 714/1334 | B: 639/1409 | C: 571/1477
[LOSS Ex1] A: 0.63638 | B: 0.61714 | C: 0.61480
[LOGITS Ex2 A] Mean Abs: 2.153 | Max: 6.647
[LOSS Ex2] A: 0.12283 | B: 0.32187 | C: 0.25100
** [JOINT LOSS] ** : 0.854671
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005473 | Grad Max: 0.147062
  -> Layer: shared_layers.0.bias | Grad Mean: 0.256686 | Grad Max: 1.213346
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005580
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002524 | Grad Max: 0.002524
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001755 | Grad Max: 0.195491
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031132 | Grad Max: 1.093815
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000199 | Grad Max: 0.007040
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015669 | Grad Max: 0.087480
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000315
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003355 | Grad Max: 0.007280
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000238
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000835 | Grad Max: 0.002638
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000625 | Grad Max: 0.001667
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014421 | Grad Max: 0.014421
[GRADIENT NORM TOTAL] 5.3332

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.870
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63159364 0.3684064 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 590/1026 | B: 605/1251 | C: 554/1494
[LOSS Ex1] A: 0.63457 | B: 0.62143 | C: 0.61584
[LOGITS Ex2 A] Mean Abs: 2.203 | Max: 7.458
[LOSS Ex2] A: 0.10794 | B: 0.31738 | C: 0.22993
** [JOINT LOSS] ** : 0.842364
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002360 | Grad Max: 0.066393
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156373 | Grad Max: 0.986257
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005952
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002658 | Grad Max: 0.002658
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.174153
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021044 | Grad Max: 0.971357
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.005631
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010326 | Grad Max: 0.075048
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000218
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002011 | Grad Max: 0.005540
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000133
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000502 | Grad Max: 0.001633
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001525
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008131 | Grad Max: 0.008131
[GRADIENT NORM TOTAL] 3.7225

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.070
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076785  0.49232146] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 718/1330 | B: 650/1398 | C: 561/1487
[LOSS Ex1] A: 0.63525 | B: 0.62084 | C: 0.61526
[LOGITS Ex2 A] Mean Abs: 2.233 | Max: 9.441
[LOSS Ex2] A: 0.11615 | B: 0.33384 | C: 0.24046
** [JOINT LOSS] ** : 0.853930
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006371 | Grad Max: 0.206721
  -> Layer: shared_layers.0.bias | Grad Mean: 0.517941 | Grad Max: 2.666577
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.005619
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005203 | Grad Max: 0.005203
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003194 | Grad Max: 0.539667
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058850 | Grad Max: 3.020158
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.012500
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030328 | Grad Max: 0.153466
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000536
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006041 | Grad Max: 0.012772
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000260
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001456 | Grad Max: 0.003973
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000806 | Grad Max: 0.002027
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021565 | Grad Max: 0.021565
[GRADIENT NORM TOTAL] 11.1108

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.009
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50868225 0.49131778] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 715/1333 | B: 638/1410 | C: 558/1490
[LOSS Ex1] A: 0.63200 | B: 0.62142 | C: 0.61598
[LOGITS Ex2 A] Mean Abs: 2.229 | Max: 6.214
[LOSS Ex2] A: 0.11383 | B: 0.32653 | C: 0.21548
** [JOINT LOSS] ** : 0.841742
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008268 | Grad Max: 0.231180
  -> Layer: shared_layers.0.bias | Grad Mean: 0.603621 | Grad Max: 3.087987
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006657
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001330 | Grad Max: 0.001330
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003835 | Grad Max: 0.602944
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070376 | Grad Max: 3.392644
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000438 | Grad Max: 0.014064
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035234 | Grad Max: 0.169086
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000599
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007170 | Grad Max: 0.015145
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000283
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001779 | Grad Max: 0.004537
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001042 | Grad Max: 0.002704
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027573 | Grad Max: 0.027573
[GRADIENT NORM TOTAL] 13.0451

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.040
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068745 0.4931255] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 717/1331 | B: 639/1409 | C: 572/1476
[LOSS Ex1] A: 0.63099 | B: 0.61700 | C: 0.61891
[LOGITS Ex2 A] Mean Abs: 2.181 | Max: 7.171
[LOSS Ex2] A: 0.13182 | B: 0.31576 | C: 0.22584
** [JOINT LOSS] ** : 0.846774
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005702 | Grad Max: 0.176677
  -> Layer: shared_layers.0.bias | Grad Mean: 0.185105 | Grad Max: 0.663919
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.006191
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001186 | Grad Max: 0.001186
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001368 | Grad Max: 0.342803
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023888 | Grad Max: 1.845327
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.004498
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008839 | Grad Max: 0.045928
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000267
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001985 | Grad Max: 0.005157
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000111
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000486 | Grad Max: 0.001337
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.001130
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007098 | Grad Max: 0.007098
[GRADIENT NORM TOTAL] 4.6348

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.860
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013945  0.49860546] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.558 | Std: 0.077
[MASKS] A(Pass/Fail): 689/1359 | B: 606/1250 | C: 553/1495
[LOSS Ex1] A: 0.63970 | B: 0.62129 | C: 0.61370
[LOGITS Ex2 A] Mean Abs: 2.114 | Max: 6.153
[LOSS Ex2] A: 0.10954 | B: 0.33116 | C: 0.21961
** [JOINT LOSS] ** : 0.844997
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006447 | Grad Max: 0.236609
  -> Layer: shared_layers.0.bias | Grad Mean: 0.712238 | Grad Max: 3.263726
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005917
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002616 | Grad Max: 0.002616
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004623 | Grad Max: 0.796183
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086576 | Grad Max: 4.436524
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000559 | Grad Max: 0.018500
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045859 | Grad Max: 0.235296
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000730
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009079 | Grad Max: 0.019840
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000382
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002222 | Grad Max: 0.005982
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001365 | Grad Max: 0.003082
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035083 | Grad Max: 0.035083
[GRADIENT NORM TOTAL] 15.9160

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.775
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439716 0.4560284] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.557 | Std: 0.075
[MASKS] A(Pass/Fail): 686/1362 | B: 650/1398 | C: 569/1479
[LOSS Ex1] A: 0.64006 | B: 0.62071 | C: 0.61231
[LOGITS Ex2 A] Mean Abs: 2.090 | Max: 6.797
[LOSS Ex2] A: 0.12291 | B: 0.36269 | C: 0.21764
** [JOINT LOSS] ** : 0.858773
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011071 | Grad Max: 0.309662
  -> Layer: shared_layers.0.bias | Grad Mean: 0.924026 | Grad Max: 4.098980
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.006230
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010284 | Grad Max: 0.010284
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005982 | Grad Max: 0.939016
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.111283 | Grad Max: 5.225818
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000742 | Grad Max: 0.023866
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.060065 | Grad Max: 0.301664
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000937
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011993 | Grad Max: 0.023838
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000532
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002967 | Grad Max: 0.008007
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001845 | Grad Max: 0.003990
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047571 | Grad Max: 0.047571
[GRADIENT NORM TOTAL] 20.0928

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.968
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.78499407 0.21500598] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.079
[MASKS] A(Pass/Fail): 750/1298 | B: 638/1410 | C: 618/1430
[LOSS Ex1] A: 0.63375 | B: 0.62129 | C: 0.61183
[LOGITS Ex2 A] Mean Abs: 2.168 | Max: 7.094
[LOSS Ex2] A: 0.10004 | B: 0.34698 | C: 0.22106
** [JOINT LOSS] ** : 0.844981
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005050 | Grad Max: 0.145668
  -> Layer: shared_layers.0.bias | Grad Mean: 0.424204 | Grad Max: 1.898108
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002259 | Grad Max: 0.006464
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004523 | Grad Max: 0.004523
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002581 | Grad Max: 0.465232
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048031 | Grad Max: 2.605263
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000322 | Grad Max: 0.010513
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025793 | Grad Max: 0.133067
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000427
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005110 | Grad Max: 0.010466
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000248
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.003704
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000837 | Grad Max: 0.002452
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020655 | Grad Max: 0.020655
[GRADIENT NORM TOTAL] 9.1085

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50042045 0.49957955] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.078
[MASKS] A(Pass/Fail): 721/1327 | B: 639/1409 | C: 553/1495
[LOSS Ex1] A: 0.64071 | B: 0.61687 | C: 0.61773
[LOGITS Ex2 A] Mean Abs: 2.249 | Max: 6.057
[LOSS Ex2] A: 0.10269 | B: 0.31314 | C: 0.23997
** [JOINT LOSS] ** : 0.843704
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004798 | Grad Max: 0.222319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.567043 | Grad Max: 2.914605
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005308
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001163 | Grad Max: 0.001163
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003715 | Grad Max: 0.581839
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.068620 | Grad Max: 3.259509
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.015164
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036765 | Grad Max: 0.198725
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000539
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007047 | Grad Max: 0.014896
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000302
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001735 | Grad Max: 0.004813
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000967 | Grad Max: 0.002455
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026791 | Grad Max: 0.026791
[GRADIENT NORM TOTAL] 12.7368

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.757
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7202555 0.2797445] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 714/1334 | B: 606/1250 | C: 367/1009
[LOSS Ex1] A: 0.63619 | B: 0.62116 | C: 0.62187
[LOGITS Ex2 A] Mean Abs: 2.222 | Max: 5.988
[LOSS Ex2] A: 0.13131 | B: 0.32760 | C: 0.25425
** [JOINT LOSS] ** : 0.864129
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006352 | Grad Max: 0.340114
  -> Layer: shared_layers.0.bias | Grad Mean: 0.873241 | Grad Max: 4.366911
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.006105
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003222 | Grad Max: 0.003222
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005678 | Grad Max: 0.835491
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.105980 | Grad Max: 4.625298
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000686 | Grad Max: 0.024130
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.056781 | Grad Max: 0.304595
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000840
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011166 | Grad Max: 0.022583
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000451
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002797 | Grad Max: 0.007192
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001700 | Grad Max: 0.003355
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045279 | Grad Max: 0.045279
[GRADIENT NORM TOTAL] 19.4901

[EPOCH SUMMARY] Train Loss: 0.8494

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8340 | Alpha: 0.5500
No improve count: 3/15

############################## EPOCH 143/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.874
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6319893  0.36801073] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 590/1026 | B: 650/1398 | C: 579/1469
[LOSS Ex1] A: 0.63439 | B: 0.62059 | C: 0.61325
[LOGITS Ex2 A] Mean Abs: 2.263 | Max: 9.470
[LOSS Ex2] A: 0.12027 | B: 0.32680 | C: 0.22692
** [JOINT LOSS] ** : 0.847402
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003878 | Grad Max: 0.196128
  -> Layer: shared_layers.0.bias | Grad Mean: 0.447137 | Grad Max: 2.435615
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.006066
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006185 | Grad Max: 0.006185
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002873 | Grad Max: 0.453488
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053203 | Grad Max: 2.487609
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000349 | Grad Max: 0.012325
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028569 | Grad Max: 0.153748
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000492
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005482 | Grad Max: 0.011800
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000250
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001372 | Grad Max: 0.003413
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000792 | Grad Max: 0.002064
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021737 | Grad Max: 0.021737
[GRADIENT NORM TOTAL] 9.9824

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.074
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50767803 0.49232197] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 719/1329 | B: 638/1410 | C: 576/1472
[LOSS Ex1] A: 0.63508 | B: 0.62117 | C: 0.61345
[LOGITS Ex2 A] Mean Abs: 2.191 | Max: 7.661
[LOSS Ex2] A: 0.10531 | B: 0.33198 | C: 0.23288
** [JOINT LOSS] ** : 0.846622
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005782 | Grad Max: 0.235009
  -> Layer: shared_layers.0.bias | Grad Mean: 0.290568 | Grad Max: 1.067762
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.005503
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000675 | Grad Max: 0.000675
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001888 | Grad Max: 0.285805
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033597 | Grad Max: 1.484036
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.008120
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017673 | Grad Max: 0.110226
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000364
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003681 | Grad Max: 0.008367
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000215
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000913 | Grad Max: 0.002593
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000605 | Grad Max: 0.001889
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015167 | Grad Max: 0.015167
[GRADIENT NORM TOTAL] 5.8060

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.013
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5087008  0.49129924] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 716/1332 | B: 639/1409 | C: 594/1454
[LOSS Ex1] A: 0.63182 | B: 0.61675 | C: 0.61213
[LOGITS Ex2 A] Mean Abs: 2.180 | Max: 6.047
[LOSS Ex2] A: 0.11296 | B: 0.31747 | C: 0.22495
** [JOINT LOSS] ** : 0.838696
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004099 | Grad Max: 0.125846
  -> Layer: shared_layers.0.bias | Grad Mean: 0.364704 | Grad Max: 1.696615
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002338 | Grad Max: 0.006665
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005842 | Grad Max: 0.005842
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002373 | Grad Max: 0.296586
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043667 | Grad Max: 1.633076
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000265 | Grad Max: 0.009177
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021793 | Grad Max: 0.119705
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000426
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004340 | Grad Max: 0.009781
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000218
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001057 | Grad Max: 0.003143
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000638 | Grad Max: 0.001925
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016848 | Grad Max: 0.016848
[GRADIENT NORM TOTAL] 7.7406

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.044
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50685906 0.4931409 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 717/1331 | B: 606/1250 | C: 583/1465
[LOSS Ex1] A: 0.63082 | B: 0.62104 | C: 0.61128
[LOGITS Ex2 A] Mean Abs: 2.173 | Max: 6.900
[LOSS Ex2] A: 0.12290 | B: 0.31469 | C: 0.23929
** [JOINT LOSS] ** : 0.846676
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004037 | Grad Max: 0.161669
  -> Layer: shared_layers.0.bias | Grad Mean: 0.082230 | Grad Max: 0.312750
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006382
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000799 | Grad Max: 0.000799
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000893 | Grad Max: 0.143716
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013901 | Grad Max: 0.779484
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.004542
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002646 | Grad Max: 0.040794
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000170
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000356 | Grad Max: 0.002433
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000060
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000095 | Grad Max: 0.000727
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000257 | Grad Max: 0.000895
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002208 | Grad Max: 0.002208
[GRADIENT NORM TOTAL] 2.7070

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.863
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50129867 0.49870133] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 689/1359 | B: 650/1398 | C: 545/1503
[LOSS Ex1] A: 0.63955 | B: 0.62047 | C: 0.62618
[LOGITS Ex2 A] Mean Abs: 2.176 | Max: 5.621
[LOSS Ex2] A: 0.10985 | B: 0.32340 | C: 0.24922
** [JOINT LOSS] ** : 0.856226
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005175 | Grad Max: 0.136092
  -> Layer: shared_layers.0.bias | Grad Mean: 0.410312 | Grad Max: 1.532676
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.005621
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008732 | Grad Max: 0.008732
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002587 | Grad Max: 0.353919
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048161 | Grad Max: 1.965453
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.013402
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026180 | Grad Max: 0.161150
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000468
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005219 | Grad Max: 0.011552
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000259
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001266 | Grad Max: 0.003825
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000721 | Grad Max: 0.001988
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018919 | Grad Max: 0.018919
[GRADIENT NORM TOTAL] 8.3000

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.778
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5440374 0.4559626] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 686/1362 | B: 639/1409 | C: 550/1498
[LOSS Ex1] A: 0.63992 | B: 0.62105 | C: 0.61321
[LOGITS Ex2 A] Mean Abs: 2.143 | Max: 6.034
[LOSS Ex2] A: 0.12260 | B: 0.33050 | C: 0.24092
** [JOINT LOSS] ** : 0.856065
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002784 | Grad Max: 0.069253
  -> Layer: shared_layers.0.bias | Grad Mean: 0.189135 | Grad Max: 0.756457
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.005503
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006228 | Grad Max: 0.006228
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001560 | Grad Max: 0.351085
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028347 | Grad Max: 1.983019
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.007668
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012186 | Grad Max: 0.076808
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000244
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002294 | Grad Max: 0.005894
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000121
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001746
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001181
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008140 | Grad Max: 0.008140
[GRADIENT NORM TOTAL] 5.3251

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.971
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.78595364 0.21404637] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.079
[MASKS] A(Pass/Fail): 751/1297 | B: 639/1409 | C: 568/1480
[LOSS Ex1] A: 0.63358 | B: 0.61663 | C: 0.61497
[LOGITS Ex2 A] Mean Abs: 2.139 | Max: 5.629
[LOSS Ex2] A: 0.11316 | B: 0.31185 | C: 0.20976
** [JOINT LOSS] ** : 0.833317
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005037 | Grad Max: 0.159678
  -> Layer: shared_layers.0.bias | Grad Mean: 0.400216 | Grad Max: 2.322187
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.005988
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004832 | Grad Max: 0.004832
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002536 | Grad Max: 0.620209
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046682 | Grad Max: 3.440555
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.008869
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022164 | Grad Max: 0.108927
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000435
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004515 | Grad Max: 0.010206
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000241
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001103 | Grad Max: 0.003354
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000663 | Grad Max: 0.002364
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016859 | Grad Max: 0.016859
[GRADIENT NORM TOTAL] 9.7567

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.077
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004798  0.49952024] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 721/1327 | B: 606/1250 | C: 566/1482
[LOSS Ex1] A: 0.64056 | B: 0.62091 | C: 0.61646
[LOGITS Ex2 A] Mean Abs: 2.135 | Max: 6.095
[LOSS Ex2] A: 0.10156 | B: 0.31844 | C: 0.22864
** [JOINT LOSS] ** : 0.842193
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006192 | Grad Max: 0.170674
  -> Layer: shared_layers.0.bias | Grad Mean: 0.486411 | Grad Max: 2.204992
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005651
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003739 | Grad Max: 0.003739
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003047 | Grad Max: 0.702339
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055576 | Grad Max: 3.922605
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.011455
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028663 | Grad Max: 0.146542
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000505
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005820 | Grad Max: 0.012846
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001438 | Grad Max: 0.004129
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000869 | Grad Max: 0.002528
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022444 | Grad Max: 0.022444
[GRADIENT NORM TOTAL] 10.6624

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.760
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.72100353 0.2789965 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 715/1333 | B: 651/1397 | C: 547/1501
[LOSS Ex1] A: 0.63603 | B: 0.62035 | C: 0.61510
[LOGITS Ex2 A] Mean Abs: 2.137 | Max: 6.528
[LOSS Ex2] A: 0.12596 | B: 0.32925 | C: 0.22882
** [JOINT LOSS] ** : 0.851835
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004086 | Grad Max: 0.124351
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125706 | Grad Max: 0.666765
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.006038
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003596 | Grad Max: 0.003596
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000899 | Grad Max: 0.128382
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015166 | Grad Max: 0.671980
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003442
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004454 | Grad Max: 0.032305
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001042 | Grad Max: 0.003578
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000087
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000260 | Grad Max: 0.001043
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000303 | Grad Max: 0.001340
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004836 | Grad Max: 0.004836
[GRADIENT NORM TOTAL] 2.8002

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.878
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63233 0.36767] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.079
[MASKS] A(Pass/Fail): 590/1026 | B: 639/1409 | C: 542/1506
[LOSS Ex1] A: 0.63422 | B: 0.62091 | C: 0.61889
[LOGITS Ex2 A] Mean Abs: 2.230 | Max: 10.109
[LOSS Ex2] A: 0.10968 | B: 0.34092 | C: 0.26550
** [JOINT LOSS] ** : 0.863376
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007658 | Grad Max: 0.266868
  -> Layer: shared_layers.0.bias | Grad Mean: 0.650440 | Grad Max: 3.560274
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.007068
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.015286 | Grad Max: 0.015286
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004058 | Grad Max: 0.766253
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.075395 | Grad Max: 4.276077
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000454 | Grad Max: 0.015786
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037061 | Grad Max: 0.183044
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000616
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007394 | Grad Max: 0.015489
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000331
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001798 | Grad Max: 0.005130
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001009 | Grad Max: 0.002211
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026531 | Grad Max: 0.026531
[GRADIENT NORM TOTAL] 14.6233

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.079
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50764006 0.49235994] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 719/1329 | B: 639/1409 | C: 532/1516
[LOSS Ex1] A: 0.63490 | B: 0.61649 | C: 0.61880
[LOGITS Ex2 A] Mean Abs: 2.228 | Max: 7.319
[LOSS Ex2] A: 0.11030 | B: 0.32891 | C: 0.23689
** [JOINT LOSS] ** : 0.848766
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010396 | Grad Max: 0.312777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.837695 | Grad Max: 4.114371
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005773
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000999 | Grad Max: 0.000999
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005239 | Grad Max: 0.909402
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.096244 | Grad Max: 5.069708
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000608 | Grad Max: 0.019343
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049728 | Grad Max: 0.252582
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000767
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010196 | Grad Max: 0.020576
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000432
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002555 | Grad Max: 0.006680
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001509 | Grad Max: 0.003226
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040459 | Grad Max: 0.040459
[GRADIENT NORM TOTAL] 18.2585

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.017
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5086216 0.4913784] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 716/1332 | B: 606/1250 | C: 563/1485
[LOSS Ex1] A: 0.63164 | B: 0.62078 | C: 0.61174
[LOGITS Ex2 A] Mean Abs: 2.189 | Max: 7.132
[LOSS Ex2] A: 0.11788 | B: 0.30774 | C: 0.21658
** [JOINT LOSS] ** : 0.835453
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007637 | Grad Max: 0.201215
  -> Layer: shared_layers.0.bias | Grad Mean: 0.322263 | Grad Max: 1.522661
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006834
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004150 | Grad Max: 0.004150
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.420166
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039232 | Grad Max: 2.340578
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000247 | Grad Max: 0.007775
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019293 | Grad Max: 0.103066
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000385
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004121 | Grad Max: 0.008513
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000193
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001090 | Grad Max: 0.002916
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000708 | Grad Max: 0.002142
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018613 | Grad Max: 0.018613
[GRADIENT NORM TOTAL] 7.2260

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.049
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50694525 0.49305472] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 718/1330 | B: 651/1397 | C: 597/1451
[LOSS Ex1] A: 0.63064 | B: 0.62021 | C: 0.61482
[LOGITS Ex2 A] Mean Abs: 2.119 | Max: 6.024
[LOSS Ex2] A: 0.12182 | B: 0.35800 | C: 0.22645
** [JOINT LOSS] ** : 0.857314
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005580 | Grad Max: 0.218806
  -> Layer: shared_layers.0.bias | Grad Mean: 0.577120 | Grad Max: 2.928868
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006485
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001076 | Grad Max: 0.001076
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003815 | Grad Max: 0.536136
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069176 | Grad Max: 2.974916
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.015248
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033994 | Grad Max: 0.202156
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000567
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006288 | Grad Max: 0.013492
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000271
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001565 | Grad Max: 0.004447
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000908 | Grad Max: 0.002429
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025589 | Grad Max: 0.025589
[GRADIENT NORM TOTAL] 13.0139

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.867
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012965  0.49870348] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 689/1359 | B: 639/1409 | C: 397/979
[LOSS Ex1] A: 0.63938 | B: 0.62079 | C: 0.61068
[LOGITS Ex2 A] Mean Abs: 2.073 | Max: 6.309
[LOSS Ex2] A: 0.11391 | B: 0.36642 | C: 0.21314
** [JOINT LOSS] ** : 0.854772
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005810 | Grad Max: 0.291340
  -> Layer: shared_layers.0.bias | Grad Mean: 0.844417 | Grad Max: 3.924367
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006459
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011998 | Grad Max: 0.011998
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005474 | Grad Max: 0.816300
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102739 | Grad Max: 4.595547
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000669 | Grad Max: 0.025557
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.056064 | Grad Max: 0.317720
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000804
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011032 | Grad Max: 0.022531
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000468
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002729 | Grad Max: 0.007261
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001556 | Grad Max: 0.003464
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042838 | Grad Max: 0.042838
[GRADIENT NORM TOTAL] 19.0797

[EPOCH SUMMARY] Train Loss: 0.8485

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8357 | Alpha: 0.5500
No improve count: 4/15

############################## EPOCH 144/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.781
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439925  0.45600754] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 686/1362 | B: 640/1408 | C: 568/1480
[LOSS Ex1] A: 0.63976 | B: 0.61637 | C: 0.61277
[LOGITS Ex2 A] Mean Abs: 2.069 | Max: 6.109
[LOSS Ex2] A: 0.11853 | B: 0.33169 | C: 0.22148
** [JOINT LOSS] ** : 0.846867
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003979 | Grad Max: 0.209672
  -> Layer: shared_layers.0.bias | Grad Mean: 0.559466 | Grad Max: 2.810719
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005485
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003891 | Grad Max: 0.003891
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003648 | Grad Max: 0.630922
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.068159 | Grad Max: 3.519400
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.018429
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035578 | Grad Max: 0.227594
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000483
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006965 | Grad Max: 0.014082
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001711 | Grad Max: 0.004822
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000993 | Grad Max: 0.002527
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027106 | Grad Max: 0.027106
[GRADIENT NORM TOTAL] 12.9580

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.975
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7867267  0.21327327] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.079
[MASKS] A(Pass/Fail): 751/1297 | B: 606/1250 | C: 544/1504
[LOSS Ex1] A: 0.63342 | B: 0.62065 | C: 0.61964
[LOGITS Ex2 A] Mean Abs: 2.153 | Max: 6.684
[LOSS Ex2] A: 0.11276 | B: 0.31610 | C: 0.23583
** [JOINT LOSS] ** : 0.846135
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005200 | Grad Max: 0.144395
  -> Layer: shared_layers.0.bias | Grad Mean: 0.227397 | Grad Max: 1.022115
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.005805
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004487 | Grad Max: 0.004487
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001686 | Grad Max: 0.306162
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030595 | Grad Max: 1.572039
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000167 | Grad Max: 0.007300
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012891 | Grad Max: 0.081571
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000266
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002707 | Grad Max: 0.006267
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000144
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000700 | Grad Max: 0.001964
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000452 | Grad Max: 0.001547
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010929 | Grad Max: 0.010929
[GRADIENT NORM TOTAL] 5.4449

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.081
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50043803 0.49956197] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 721/1327 | B: 651/1397 | C: 592/1456
[LOSS Ex1] A: 0.64041 | B: 0.62010 | C: 0.61021
[LOGITS Ex2 A] Mean Abs: 2.204 | Max: 5.631
[LOSS Ex2] A: 0.09986 | B: 0.33256 | C: 0.23054
** [JOINT LOSS] ** : 0.844563
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004618 | Grad Max: 0.212585
  -> Layer: shared_layers.0.bias | Grad Mean: 0.508400 | Grad Max: 2.679003
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005896
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001412 | Grad Max: 0.001412
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003147 | Grad Max: 0.479446
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058960 | Grad Max: 2.678427
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.016286
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031801 | Grad Max: 0.192844
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000478
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006137 | Grad Max: 0.012487
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000256
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001538 | Grad Max: 0.004000
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000860 | Grad Max: 0.002462
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024046 | Grad Max: 0.024046
[GRADIENT NORM TOTAL] 11.1311

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.763
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7214717  0.27852824] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 715/1333 | B: 639/1409 | C: 565/1483
[LOSS Ex1] A: 0.63588 | B: 0.62067 | C: 0.61614
[LOGITS Ex2 A] Mean Abs: 2.147 | Max: 6.414
[LOSS Ex2] A: 0.12112 | B: 0.33245 | C: 0.23543
** [JOINT LOSS] ** : 0.853895
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004194 | Grad Max: 0.140329
  -> Layer: shared_layers.0.bias | Grad Mean: 0.185621 | Grad Max: 0.922879
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006331
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012276 | Grad Max: 0.012276
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001353 | Grad Max: 0.224931
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023019 | Grad Max: 1.212999
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.005403
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005925 | Grad Max: 0.051805
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000171
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000914 | Grad Max: 0.003278
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000080
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000221 | Grad Max: 0.001030
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000336 | Grad Max: 0.000848
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002821 | Grad Max: 0.002821
[GRADIENT NORM TOTAL] 4.3975

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.881
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6326799 0.3673201] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.079
[MASKS] A(Pass/Fail): 590/1026 | B: 640/1408 | C: 528/1520
[LOSS Ex1] A: 0.63407 | B: 0.61626 | C: 0.61937
[LOGITS Ex2 A] Mean Abs: 2.172 | Max: 8.857
[LOSS Ex2] A: 0.11542 | B: 0.31576 | C: 0.24602
** [JOINT LOSS] ** : 0.848965
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006626 | Grad Max: 0.184106
  -> Layer: shared_layers.0.bias | Grad Mean: 0.458099 | Grad Max: 1.768980
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.005952
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008004 | Grad Max: 0.008004
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002894 | Grad Max: 0.355227
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052936 | Grad Max: 1.972973
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000353 | Grad Max: 0.010628
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028861 | Grad Max: 0.136742
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000465
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005965 | Grad Max: 0.013031
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000268
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001506 | Grad Max: 0.004101
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000949 | Grad Max: 0.002343
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024775 | Grad Max: 0.024775
[GRADIENT NORM TOTAL] 9.3232

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.082
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50766796 0.49233207] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.080
[MASKS] A(Pass/Fail): 719/1329 | B: 606/1250 | C: 576/1472
[LOSS Ex1] A: 0.63475 | B: 0.62053 | C: 0.60945
[LOGITS Ex2 A] Mean Abs: 2.146 | Max: 8.388
[LOSS Ex2] A: 0.10034 | B: 0.32202 | C: 0.22818
** [JOINT LOSS] ** : 0.838425
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005399 | Grad Max: 0.179016
  -> Layer: shared_layers.0.bias | Grad Mean: 0.375441 | Grad Max: 1.640949
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006018
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001419 | Grad Max: 0.001419
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002400 | Grad Max: 0.601190
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044207 | Grad Max: 3.350138
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000268 | Grad Max: 0.010005
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021772 | Grad Max: 0.121679
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000408
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004437 | Grad Max: 0.009814
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000218
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001094 | Grad Max: 0.003047
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000680 | Grad Max: 0.002107
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017152 | Grad Max: 0.017152
[GRADIENT NORM TOTAL] 8.4885

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.021
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50858766 0.4914123 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 716/1332 | B: 651/1397 | C: 553/1495
[LOSS Ex1] A: 0.63149 | B: 0.61999 | C: 0.61499
[LOGITS Ex2 A] Mean Abs: 2.160 | Max: 6.426
[LOSS Ex2] A: 0.10788 | B: 0.32937 | C: 0.22653
** [JOINT LOSS] ** : 0.843415
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002220 | Grad Max: 0.081100
  -> Layer: shared_layers.0.bias | Grad Mean: 0.151681 | Grad Max: 0.824321
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002312 | Grad Max: 0.006854
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007855 | Grad Max: 0.007855
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001165 | Grad Max: 0.226960
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021225 | Grad Max: 1.256246
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.005615
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011303 | Grad Max: 0.069698
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002255 | Grad Max: 0.005452
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000137
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001816
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000431 | Grad Max: 0.001521
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009246 | Grad Max: 0.009246
[GRADIENT NORM TOTAL] 3.8918

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.052
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50698876 0.4930112 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 718/1330 | B: 639/1409 | C: 557/1491
[LOSS Ex1] A: 0.63048 | B: 0.62056 | C: 0.61263
[LOGITS Ex2 A] Mean Abs: 2.147 | Max: 8.405
[LOSS Ex2] A: 0.13247 | B: 0.32894 | C: 0.21181
** [JOINT LOSS] ** : 0.845633
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004217 | Grad Max: 0.150206
  -> Layer: shared_layers.0.bias | Grad Mean: 0.221224 | Grad Max: 0.996747
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.006250
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003137 | Grad Max: 0.003137
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001661 | Grad Max: 0.277787
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029116 | Grad Max: 1.552037
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000176 | Grad Max: 0.005775
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014124 | Grad Max: 0.080384
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000322
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003062 | Grad Max: 0.006877
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000156
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000783 | Grad Max: 0.002120
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000469 | Grad Max: 0.001667
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012237 | Grad Max: 0.012237
[GRADIENT NORM TOTAL] 5.2845

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.870
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012607 0.4987393] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 689/1359 | B: 640/1408 | C: 541/1507
[LOSS Ex1] A: 0.63923 | B: 0.61614 | C: 0.61795
[LOGITS Ex2 A] Mean Abs: 2.093 | Max: 7.001
[LOSS Ex2] A: 0.11255 | B: 0.31356 | C: 0.23151
** [JOINT LOSS] ** : 0.843648
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003358 | Grad Max: 0.111901
  -> Layer: shared_layers.0.bias | Grad Mean: 0.319543 | Grad Max: 1.600701
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005496
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003189 | Grad Max: 0.003189
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.555696
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038414 | Grad Max: 3.117706
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.009133
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018602 | Grad Max: 0.109195
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000317
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003554 | Grad Max: 0.008189
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000181
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000873 | Grad Max: 0.002636
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000547 | Grad Max: 0.001956
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014023 | Grad Max: 0.014023
[GRADIENT NORM TOTAL] 7.9188

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.783
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.543993   0.45600697] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 686/1362 | B: 606/1250 | C: 582/1466
[LOSS Ex1] A: 0.63961 | B: 0.62041 | C: 0.61488
[LOGITS Ex2 A] Mean Abs: 2.078 | Max: 5.666
[LOSS Ex2] A: 0.11698 | B: 0.31869 | C: 0.22577
** [JOINT LOSS] ** : 0.845446
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004027 | Grad Max: 0.129699
  -> Layer: shared_layers.0.bias | Grad Mean: 0.388502 | Grad Max: 1.756773
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005624
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001101 | Grad Max: 0.001101
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002358 | Grad Max: 0.418619
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043948 | Grad Max: 2.344821
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000295 | Grad Max: 0.011931
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024383 | Grad Max: 0.153657
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000408
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004756 | Grad Max: 0.010466
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000239
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001163 | Grad Max: 0.003541
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000666 | Grad Max: 0.002064
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017548 | Grad Max: 0.017548
[GRADIENT NORM TOTAL] 8.2558

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.979
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7875203  0.21247965] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.079
[MASKS] A(Pass/Fail): 751/1297 | B: 651/1397 | C: 610/1438
[LOSS Ex1] A: 0.63326 | B: 0.61987 | C: 0.60970
[LOGITS Ex2 A] Mean Abs: 2.163 | Max: 6.467
[LOSS Ex2] A: 0.10829 | B: 0.34245 | C: 0.22195
** [JOINT LOSS] ** : 0.845176
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003433 | Grad Max: 0.135451
  -> Layer: shared_layers.0.bias | Grad Mean: 0.226094 | Grad Max: 1.378317
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.006700
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005872 | Grad Max: 0.005872
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001610 | Grad Max: 0.304799
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029242 | Grad Max: 1.677679
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.007844
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012831 | Grad Max: 0.085776
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000264
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002632 | Grad Max: 0.005910
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000132
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000666 | Grad Max: 0.001785
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001453
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010287 | Grad Max: 0.010287
[GRADIENT NORM TOTAL] 5.6282

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.085
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004259  0.49957415] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.078
[MASKS] A(Pass/Fail): 721/1327 | B: 639/1409 | C: 587/1461
[LOSS Ex1] A: 0.64025 | B: 0.62044 | C: 0.61504
[LOGITS Ex2 A] Mean Abs: 2.176 | Max: 6.013
[LOSS Ex2] A: 0.10085 | B: 0.32193 | C: 0.24132
** [JOINT LOSS] ** : 0.846605
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002486 | Grad Max: 0.097018
  -> Layer: shared_layers.0.bias | Grad Mean: 0.239905 | Grad Max: 1.301092
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.005567
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003670 | Grad Max: 0.003670
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001596 | Grad Max: 0.351888
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029354 | Grad Max: 1.981730
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.008158
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012542 | Grad Max: 0.080395
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000208
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002357 | Grad Max: 0.006111
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000121
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000562 | Grad Max: 0.001679
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000388 | Grad Max: 0.001289
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007575 | Grad Max: 0.007575
[GRADIENT NORM TOTAL] 5.7227

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.766
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7220352  0.27796477] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 715/1333 | B: 640/1408 | C: 560/1488
[LOSS Ex1] A: 0.63571 | B: 0.61601 | C: 0.61360
[LOGITS Ex2 A] Mean Abs: 2.152 | Max: 6.901
[LOSS Ex2] A: 0.12627 | B: 0.31520 | C: 0.20342
** [JOINT LOSS] ** : 0.836737
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004545 | Grad Max: 0.117885
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201600 | Grad Max: 1.006311
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.005781
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008244 | Grad Max: 0.008244
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001432 | Grad Max: 0.199591
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025797 | Grad Max: 1.096842
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000178 | Grad Max: 0.005440
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014276 | Grad Max: 0.067627
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000298
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002894 | Grad Max: 0.006897
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.002255
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000464 | Grad Max: 0.001797
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010717 | Grad Max: 0.010717
[GRADIENT NORM TOTAL] 4.3756

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.885
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6329312  0.36706886] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.080
[MASKS] A(Pass/Fail): 590/1026 | B: 606/1250 | C: 392/984
[LOSS Ex1] A: 0.63390 | B: 0.62027 | C: 0.62253
[LOGITS Ex2 A] Mean Abs: 2.207 | Max: 8.630
[LOSS Ex2] A: 0.10997 | B: 0.31154 | C: 0.24962
** [JOINT LOSS] ** : 0.849277
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002866 | Grad Max: 0.045399
  -> Layer: shared_layers.0.bias | Grad Mean: 0.109826 | Grad Max: 0.568672
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.006154
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011889 | Grad Max: 0.011890
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000820 | Grad Max: 0.435877
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013870 | Grad Max: 2.442509
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003463
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002624 | Grad Max: 0.029198
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000141
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000452 | Grad Max: 0.002617
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000116 | Grad Max: 0.001077
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000298 | Grad Max: 0.000967
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001418 | Grad Max: 0.001418
[GRADIENT NORM TOTAL] 3.8524

[EPOCH SUMMARY] Train Loss: 0.8453

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8277 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8312 -> New: 0.8277)

############################## EPOCH 145/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.087
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076589 0.4923411] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.080
[MASKS] A(Pass/Fail): 720/1328 | B: 651/1397 | C: 517/1531
[LOSS Ex1] A: 0.63458 | B: 0.61973 | C: 0.61947
[LOGITS Ex2 A] Mean Abs: 2.212 | Max: 7.610
[LOSS Ex2] A: 0.10750 | B: 0.33266 | C: 0.23096
** [JOINT LOSS] ** : 0.848298
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003898 | Grad Max: 0.187776
  -> Layer: shared_layers.0.bias | Grad Mean: 0.357691 | Grad Max: 2.182126
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.005430
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000830 | Grad Max: 0.000830
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.396512
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041631 | Grad Max: 2.209554
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000279 | Grad Max: 0.009751
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023144 | Grad Max: 0.119491
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000372
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004564 | Grad Max: 0.009989
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000213
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001131 | Grad Max: 0.003079
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000634 | Grad Max: 0.001925
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017167 | Grad Max: 0.017167
[GRADIENT NORM TOTAL] 7.8711

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.025
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850946 0.49149057] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 716/1332 | B: 639/1409 | C: 546/1502
[LOSS Ex1] A: 0.63131 | B: 0.62029 | C: 0.61715
[LOGITS Ex2 A] Mean Abs: 2.196 | Max: 5.574
[LOSS Ex2] A: 0.11435 | B: 0.31093 | C: 0.24369
** [JOINT LOSS] ** : 0.845905
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002667 | Grad Max: 0.059048
  -> Layer: shared_layers.0.bias | Grad Mean: 0.131611 | Grad Max: 0.653112
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.006204
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001624 | Grad Max: 0.001624
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001168 | Grad Max: 0.491435
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021037 | Grad Max: 2.731622
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.005102
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007977 | Grad Max: 0.054382
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001629 | Grad Max: 0.004547
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000109
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000385 | Grad Max: 0.001418
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000354 | Grad Max: 0.001037
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004718 | Grad Max: 0.004718
[GRADIENT NORM TOTAL] 4.9785

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.056
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50704795 0.49295205] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.079
[MASKS] A(Pass/Fail): 718/1330 | B: 640/1408 | C: 574/1474
[LOSS Ex1] A: 0.63030 | B: 0.61587 | C: 0.61323
[LOGITS Ex2 A] Mean Abs: 2.115 | Max: 8.705
[LOSS Ex2] A: 0.12026 | B: 0.32734 | C: 0.23074
** [JOINT LOSS] ** : 0.845911
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004393 | Grad Max: 0.171797
  -> Layer: shared_layers.0.bias | Grad Mean: 0.457066 | Grad Max: 2.096837
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006440
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003439 | Grad Max: 0.003439
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002843 | Grad Max: 0.435505
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052842 | Grad Max: 2.398676
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.013678
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026583 | Grad Max: 0.182173
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000397
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005220 | Grad Max: 0.010378
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001341 | Grad Max: 0.003580
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000794 | Grad Max: 0.002093
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021861 | Grad Max: 0.021861
[GRADIENT NORM TOTAL] 9.7300

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.873
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012327  0.49876732] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 689/1359 | B: 606/1250 | C: 560/1488
[LOSS Ex1] A: 0.63905 | B: 0.62012 | C: 0.61501
[LOGITS Ex2 A] Mean Abs: 2.126 | Max: 6.549
[LOSS Ex2] A: 0.10718 | B: 0.31615 | C: 0.21506
** [JOINT LOSS] ** : 0.837522
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004129 | Grad Max: 0.133722
  -> Layer: shared_layers.0.bias | Grad Mean: 0.427546 | Grad Max: 1.781581
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005675
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005884 | Grad Max: 0.005884
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002811 | Grad Max: 0.431505
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052323 | Grad Max: 2.447614
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.011056
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027558 | Grad Max: 0.151617
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000472
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005504 | Grad Max: 0.012200
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000280
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001359 | Grad Max: 0.004077
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000787 | Grad Max: 0.002398
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020542 | Grad Max: 0.020542
[GRADIENT NORM TOTAL] 9.3527

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.787
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54398817 0.45601183] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.075
[MASKS] A(Pass/Fail): 686/1362 | B: 651/1397 | C: 582/1466
[LOSS Ex1] A: 0.63944 | B: 0.61959 | C: 0.61456
[LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.174
[LOSS Ex2] A: 0.11727 | B: 0.33097 | C: 0.27193
** [JOINT LOSS] ** : 0.864590
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003756 | Grad Max: 0.132255
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269843 | Grad Max: 1.464830
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005306
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001652 | Grad Max: 0.001652
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.494787
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035569 | Grad Max: 2.762136
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.007671
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016481 | Grad Max: 0.088233
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000321
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003187 | Grad Max: 0.007875
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000757 | Grad Max: 0.002821
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000377 | Grad Max: 0.001115
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010524 | Grad Max: 0.010524
[GRADIENT NORM TOTAL] 7.1540

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.982
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7884864 0.2115136] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.080
[MASKS] A(Pass/Fail): 751/1297 | B: 639/1409 | C: 620/1428
[LOSS Ex1] A: 0.63308 | B: 0.62015 | C: 0.60858
[LOGITS Ex2 A] Mean Abs: 2.191 | Max: 6.591
[LOSS Ex2] A: 0.11063 | B: 0.33823 | C: 0.23122
** [JOINT LOSS] ** : 0.847299
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005195 | Grad Max: 0.165205
  -> Layer: shared_layers.0.bias | Grad Mean: 0.445922 | Grad Max: 2.195778
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.006435
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001914 | Grad Max: 0.001914
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002876 | Grad Max: 0.657604
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053177 | Grad Max: 3.663275
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.012129
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026110 | Grad Max: 0.165184
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000439
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005179 | Grad Max: 0.011238
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000244
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001294 | Grad Max: 0.003468
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000702 | Grad Max: 0.002055
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019176 | Grad Max: 0.019176
[GRADIENT NORM TOTAL] 10.3242

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.089
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003996 0.4996004] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 721/1327 | B: 640/1408 | C: 562/1486
[LOSS Ex1] A: 0.64008 | B: 0.61573 | C: 0.61516
[LOGITS Ex2 A] Mean Abs: 2.181 | Max: 6.313
[LOSS Ex2] A: 0.09786 | B: 0.31362 | C: 0.22831
** [JOINT LOSS] ** : 0.836919
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002645 | Grad Max: 0.064875
  -> Layer: shared_layers.0.bias | Grad Mean: 0.121638 | Grad Max: 0.481712
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.005552
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002551 | Grad Max: 0.002551
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000911 | Grad Max: 0.427436
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015601 | Grad Max: 2.353610
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.003228
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002480 | Grad Max: 0.022084
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000172
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000553 | Grad Max: 0.002880
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000671
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000422 | Grad Max: 0.001157
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002073 | Grad Max: 0.002073
[GRADIENT NORM TOTAL] 4.5116

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.769
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.72273517 0.2772648 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.080
[MASKS] A(Pass/Fail): 714/1334 | B: 606/1250 | C: 566/1482
[LOSS Ex1] A: 0.63551 | B: 0.61997 | C: 0.61616
[LOGITS Ex2 A] Mean Abs: 2.124 | Max: 6.159
[LOSS Ex2] A: 0.13097 | B: 0.32159 | C: 0.23019
** [JOINT LOSS] ** : 0.851467
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006062 | Grad Max: 0.160743
  -> Layer: shared_layers.0.bias | Grad Mean: 0.368605 | Grad Max: 1.865063
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.006031
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003941 | Grad Max: 0.003941
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002291 | Grad Max: 0.239275
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041996 | Grad Max: 1.323700
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.011356
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024750 | Grad Max: 0.136553
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000436
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004995 | Grad Max: 0.010863
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000282
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001241 | Grad Max: 0.003900
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000801 | Grad Max: 0.002331
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019945 | Grad Max: 0.019945
[GRADIENT NORM TOTAL] 7.1375

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.889
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6332908  0.36670914] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.080
[MASKS] A(Pass/Fail): 590/1026 | B: 651/1397 | C: 580/1468
[LOSS Ex1] A: 0.63370 | B: 0.61945 | C: 0.61182
[LOGITS Ex2 A] Mean Abs: 2.212 | Max: 7.305
[LOSS Ex2] A: 0.10935 | B: 0.32683 | C: 0.21829
** [JOINT LOSS] ** : 0.839812
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002474 | Grad Max: 0.061440
  -> Layer: shared_layers.0.bias | Grad Mean: 0.089233 | Grad Max: 0.463665
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.006390
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005964 | Grad Max: 0.005964
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000624 | Grad Max: 0.182248
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010429 | Grad Max: 0.994864
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.003369
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.001933 | Grad Max: 0.016696
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000120
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000339 | Grad Max: 0.002066
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000571
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000387 | Grad Max: 0.001303
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001083 | Grad Max: 0.001083
[GRADIENT NORM TOTAL] 2.3812

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.091
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077049  0.49229512] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.080
[MASKS] A(Pass/Fail): 720/1328 | B: 640/1408 | C: 590/1458
[LOSS Ex1] A: 0.63438 | B: 0.62000 | C: 0.61237
[LOGITS Ex2 A] Mean Abs: 2.210 | Max: 7.668
[LOSS Ex2] A: 0.10443 | B: 0.32732 | C: 0.22212
** [JOINT LOSS] ** : 0.840210
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004381 | Grad Max: 0.122634
  -> Layer: shared_layers.0.bias | Grad Mean: 0.365574 | Grad Max: 1.700921
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.005641
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003466 | Grad Max: 0.003466
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002459 | Grad Max: 0.437562
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044905 | Grad Max: 2.446554
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000263 | Grad Max: 0.011666
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021853 | Grad Max: 0.144821
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000423
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004408 | Grad Max: 0.011290
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001093 | Grad Max: 0.003242
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.001870
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015896 | Grad Max: 0.015896
[GRADIENT NORM TOTAL] 8.7082

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.029
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50845283 0.49154717] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 716/1332 | B: 641/1407 | C: 570/1478
[LOSS Ex1] A: 0.63110 | B: 0.61557 | C: 0.61395
[LOGITS Ex2 A] Mean Abs: 2.193 | Max: 6.420
[LOSS Ex2] A: 0.11226 | B: 0.30101 | C: 0.22535
** [JOINT LOSS] ** : 0.833080
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003754 | Grad Max: 0.110798
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215554 | Grad Max: 1.182494
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.006043
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002331 | Grad Max: 0.002331
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001584 | Grad Max: 0.499442
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028659 | Grad Max: 2.769616
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.005523
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011895 | Grad Max: 0.065254
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002465 | Grad Max: 0.005671
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000130
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000618 | Grad Max: 0.001801
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000431 | Grad Max: 0.001585
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009679 | Grad Max: 0.009679
[GRADIENT NORM TOTAL] 6.1557

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.061
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50718826 0.49281177] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 718/1330 | B: 606/1250 | C: 581/1467
[LOSS Ex1] A: 0.63008 | B: 0.61981 | C: 0.61185
[LOGITS Ex2 A] Mean Abs: 2.143 | Max: 7.256
[LOSS Ex2] A: 0.12533 | B: 0.31602 | C: 0.17732
** [JOINT LOSS] ** : 0.826804
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003775 | Grad Max: 0.159701
  -> Layer: shared_layers.0.bias | Grad Mean: 0.394557 | Grad Max: 2.373050
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.006587
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003249 | Grad Max: 0.003249
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002558 | Grad Max: 0.561198
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046847 | Grad Max: 3.126805
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000249 | Grad Max: 0.010957
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020629 | Grad Max: 0.117225
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000322
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003920 | Grad Max: 0.009615
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000224
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000935 | Grad Max: 0.003308
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000532 | Grad Max: 0.002233
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013608 | Grad Max: 0.013608
[GRADIENT NORM TOTAL] 9.5828

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.877
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012676  0.49873236] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 689/1359 | B: 651/1397 | C: 562/1486
[LOSS Ex1] A: 0.63885 | B: 0.61928 | C: 0.61558
[LOGITS Ex2 A] Mean Abs: 2.113 | Max: 6.116
[LOSS Ex2] A: 0.10498 | B: 0.34055 | C: 0.23299
** [JOINT LOSS] ** : 0.850743
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005101 | Grad Max: 0.178039
  -> Layer: shared_layers.0.bias | Grad Mean: 0.489628 | Grad Max: 2.362270
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.005671
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006572 | Grad Max: 0.006572
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003139 | Grad Max: 0.474675
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058124 | Grad Max: 2.657853
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000343 | Grad Max: 0.011229
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028696 | Grad Max: 0.145429
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000476
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005805 | Grad Max: 0.011949
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001474 | Grad Max: 0.004148
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000956 | Grad Max: 0.002410
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024350 | Grad Max: 0.024350
[GRADIENT NORM TOTAL] 10.6460

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.791
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5440161  0.45598385] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.076
[MASKS] A(Pass/Fail): 686/1362 | B: 640/1408 | C: 355/1021
[LOSS Ex1] A: 0.63925 | B: 0.61984 | C: 0.61567
[LOGITS Ex2 A] Mean Abs: 2.143 | Max: 5.402
[LOSS Ex2] A: 0.11190 | B: 0.32342 | C: 0.22403
** [JOINT LOSS] ** : 0.844706
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003082 | Grad Max: 0.077064
  -> Layer: shared_layers.0.bias | Grad Mean: 0.157031 | Grad Max: 0.754877
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005667
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008876 | Grad Max: 0.008876
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001241 | Grad Max: 0.262237
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022122 | Grad Max: 1.447779
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.004252
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007418 | Grad Max: 0.041040
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000186
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001442 | Grad Max: 0.004626
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000351 | Grad Max: 0.001220
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000438 | Grad Max: 0.001291
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004673 | Grad Max: 0.004673
[GRADIENT NORM TOTAL] 4.2061

[EPOCH SUMMARY] Train Loss: 0.8438

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8251 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8277 -> New: 0.8251)

############################## EPOCH 146/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.987
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.78961784 0.21038209] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.080
[MASKS] A(Pass/Fail): 751/1297 | B: 643/1405 | C: 590/1458
[LOSS Ex1] A: 0.63287 | B: 0.61541 | C: 0.61335
[LOGITS Ex2 A] Mean Abs: 2.194 | Max: 7.066
[LOSS Ex2] A: 0.10649 | B: 0.30606 | C: 0.24467
** [JOINT LOSS] ** : 0.839617
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004787 | Grad Max: 0.145319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.331769 | Grad Max: 1.878395
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.005696
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004097 | Grad Max: 0.004097
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.522220
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037699 | Grad Max: 2.915953
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000226 | Grad Max: 0.010202
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018613 | Grad Max: 0.111564
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000317
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003705 | Grad Max: 0.008198
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000175
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000898 | Grad Max: 0.002544
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000484 | Grad Max: 0.001603
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013384 | Grad Max: 0.013384
[GRADIENT NORM TOTAL] 7.8773

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.095
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003084 0.4996916] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 721/1327 | B: 607/1249 | C: 553/1495
[LOSS Ex1] A: 0.63988 | B: 0.61966 | C: 0.61669
[LOGITS Ex2 A] Mean Abs: 2.180 | Max: 6.000
[LOSS Ex2] A: 0.09448 | B: 0.30786 | C: 0.22947
** [JOINT LOSS] ** : 0.836010
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002714 | Grad Max: 0.073461
  -> Layer: shared_layers.0.bias | Grad Mean: 0.175259 | Grad Max: 0.731726
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005985
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000024 | Grad Max: 0.000024
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001203 | Grad Max: 0.178721
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021450 | Grad Max: 0.989484
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.004633
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009258 | Grad Max: 0.055175
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000232
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001844 | Grad Max: 0.005013
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000439 | Grad Max: 0.001383
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000396 | Grad Max: 0.001549
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007204 | Grad Max: 0.007204
[GRADIENT NORM TOTAL] 3.8323

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.773
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7236364  0.27636364] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.080
[MASKS] A(Pass/Fail): 714/1334 | B: 652/1396 | C: 565/1483
[LOSS Ex1] A: 0.63529 | B: 0.61914 | C: 0.61406
[LOGITS Ex2 A] Mean Abs: 2.158 | Max: 7.239
[LOSS Ex2] A: 0.12171 | B: 0.32359 | C: 0.23274
** [JOINT LOSS] ** : 0.848840
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002395 | Grad Max: 0.047443
  -> Layer: shared_layers.0.bias | Grad Mean: 0.128585 | Grad Max: 0.548426
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.005671
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003302 | Grad Max: 0.003302
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001022 | Grad Max: 0.479250
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018177 | Grad Max: 2.684140
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.004992
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006258 | Grad Max: 0.051340
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000196
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001209 | Grad Max: 0.004001
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000302 | Grad Max: 0.001250
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000300 | Grad Max: 0.001414
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005084 | Grad Max: 0.005084
[GRADIENT NORM TOTAL] 4.3335

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.893
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63357925 0.36642075] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.080
[MASKS] A(Pass/Fail): 590/1026 | B: 641/1407 | C: 551/1497
[LOSS Ex1] A: 0.63345 | B: 0.61969 | C: 0.61513
[LOGITS Ex2 A] Mean Abs: 2.249 | Max: 7.939
[LOSS Ex2] A: 0.10721 | B: 0.32874 | C: 0.25965
** [JOINT LOSS] ** : 0.854623
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003676 | Grad Max: 0.187723
  -> Layer: shared_layers.0.bias | Grad Mean: 0.472153 | Grad Max: 2.695975
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.005693
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006200 | Grad Max: 0.006200
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002907 | Grad Max: 0.477784
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053756 | Grad Max: 2.631457
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000313 | Grad Max: 0.011997
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026338 | Grad Max: 0.154328
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000389
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005030 | Grad Max: 0.011208
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000239
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001196 | Grad Max: 0.003810
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000585 | Grad Max: 0.001627
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016501 | Grad Max: 0.016501
[GRADIENT NORM TOTAL] 10.5109

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.098
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50780094 0.49219906] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 721/1327 | B: 643/1405 | C: 540/1508
[LOSS Ex1] A: 0.63416 | B: 0.61526 | C: 0.61997
[LOGITS Ex2 A] Mean Abs: 2.222 | Max: 9.099
[LOSS Ex2] A: 0.10376 | B: 0.31114 | C: 0.22388
** [JOINT LOSS] ** : 0.836057
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002791 | Grad Max: 0.118336
  -> Layer: shared_layers.0.bias | Grad Mean: 0.257826 | Grad Max: 1.479319
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002151 | Grad Max: 0.005807
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007274 | Grad Max: 0.007274
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001723 | Grad Max: 0.381173
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031463 | Grad Max: 2.123986
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000163 | Grad Max: 0.006116
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013549 | Grad Max: 0.076324
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000242
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002740 | Grad Max: 0.006447
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000158
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000685 | Grad Max: 0.002070
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000444 | Grad Max: 0.001637
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010411 | Grad Max: 0.010411
[GRADIENT NORM TOTAL] 6.1728

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.034
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50831866 0.49168137] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 716/1332 | B: 607/1249 | C: 554/1494
[LOSS Ex1] A: 0.63087 | B: 0.61951 | C: 0.61824
[LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.859
[LOSS Ex2] A: 0.11204 | B: 0.30859 | C: 0.21088
** [JOINT LOSS] ** : 0.833374
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003512 | Grad Max: 0.115865
  -> Layer: shared_layers.0.bias | Grad Mean: 0.329946 | Grad Max: 1.636300
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002306 | Grad Max: 0.006727
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010923 | Grad Max: 0.010923
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.627280
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.037904 | Grad Max: 3.488873
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000211 | Grad Max: 0.007900
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017570 | Grad Max: 0.107716
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000331
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003424 | Grad Max: 0.007552
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000173
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000825 | Grad Max: 0.002444
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000494 | Grad Max: 0.001777
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012387 | Grad Max: 0.012387
[GRADIENT NORM TOTAL] 8.2135

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.066
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073336  0.49266642] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 718/1330 | B: 652/1396 | C: 544/1504
[LOSS Ex1] A: 0.62985 | B: 0.61900 | C: 0.61705
[LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.344
[LOSS Ex2] A: 0.12762 | B: 0.34256 | C: 0.23145
** [JOINT LOSS] ** : 0.855839
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004829 | Grad Max: 0.193693
  -> Layer: shared_layers.0.bias | Grad Mean: 0.450795 | Grad Max: 2.567899
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.005881
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000923 | Grad Max: 0.000923
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002895 | Grad Max: 0.706003
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052685 | Grad Max: 3.922365
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000304 | Grad Max: 0.010742
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024943 | Grad Max: 0.135207
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000404
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004599 | Grad Max: 0.009161
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000227
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001175 | Grad Max: 0.003336
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000724 | Grad Max: 0.001894
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019636 | Grad Max: 0.019636
[GRADIENT NORM TOTAL] 10.8162

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.882
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50129914 0.49870086] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.079
[MASKS] A(Pass/Fail): 688/1360 | B: 641/1407 | C: 583/1465
[LOSS Ex1] A: 0.63863 | B: 0.61955 | C: 0.60907
[LOGITS Ex2 A] Mean Abs: 2.168 | Max: 5.791
[LOSS Ex2] A: 0.10348 | B: 0.32288 | C: 0.21869
** [JOINT LOSS] ** : 0.837433
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002439 | Grad Max: 0.054570
  -> Layer: shared_layers.0.bias | Grad Mean: 0.137617 | Grad Max: 0.755732
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005877
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002217 | Grad Max: 0.002217
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000850 | Grad Max: 0.477638
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014942 | Grad Max: 2.662036
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.003124
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002334 | Grad Max: 0.024040
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000105
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000420 | Grad Max: 0.002161
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000114 | Grad Max: 0.000737
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000443 | Grad Max: 0.001012
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001113 | Grad Max: 0.001113
[GRADIENT NORM TOTAL] 4.4797

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.794
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438778 0.4561222] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.076
[MASKS] A(Pass/Fail): 686/1362 | B: 644/1404 | C: 556/1492
[LOSS Ex1] A: 0.63904 | B: 0.61512 | C: 0.61966
[LOGITS Ex2 A] Mean Abs: 2.145 | Max: 6.043
[LOSS Ex2] A: 0.11016 | B: 0.30988 | C: 0.21667
** [JOINT LOSS] ** : 0.836843
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003313 | Grad Max: 0.127864
  -> Layer: shared_layers.0.bias | Grad Mean: 0.330760 | Grad Max: 1.401525
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005510
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003366 | Grad Max: 0.003366
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.291886
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041719 | Grad Max: 1.629793
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.011097
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020920 | Grad Max: 0.124850
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000381
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004144 | Grad Max: 0.008831
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000225
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001039 | Grad Max: 0.003100
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000619 | Grad Max: 0.002012
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016660 | Grad Max: 0.016660
[GRADIENT NORM TOTAL] 7.3066

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.992
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7905287  0.20947124] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.080
[MASKS] A(Pass/Fail): 751/1297 | B: 607/1249 | C: 564/1484
[LOSS Ex1] A: 0.63266 | B: 0.61937 | C: 0.61254
[LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.833
[LOSS Ex2] A: 0.10771 | B: 0.30665 | C: 0.21544
** [JOINT LOSS] ** : 0.831460
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002482 | Grad Max: 0.068362
  -> Layer: shared_layers.0.bias | Grad Mean: 0.098755 | Grad Max: 0.364832
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006555 | Grad Max: 0.006555
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000730 | Grad Max: 0.156309
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012825 | Grad Max: 0.862393
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.002993
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004342 | Grad Max: 0.033693
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000168
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000699 | Grad Max: 0.003516
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000163 | Grad Max: 0.001001
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001213
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000754 | Grad Max: 0.000754
[GRADIENT NORM TOTAL] 2.2772

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50027436 0.4997257 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 721/1327 | B: 652/1396 | C: 608/1440
[LOSS Ex1] A: 0.63968 | B: 0.61885 | C: 0.61151
[LOGITS Ex2 A] Mean Abs: 2.219 | Max: 6.423
[LOSS Ex2] A: 0.10044 | B: 0.32406 | C: 0.22805
** [JOINT LOSS] ** : 0.840866
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002295 | Grad Max: 0.063146
  -> Layer: shared_layers.0.bias | Grad Mean: 0.102668 | Grad Max: 0.545969
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.005623
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003361 | Grad Max: 0.003361
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000696 | Grad Max: 0.244517
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011589 | Grad Max: 1.354017
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003529
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004023 | Grad Max: 0.038273
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000143
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000602 | Grad Max: 0.002989
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000118 | Grad Max: 0.000729
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000341 | Grad Max: 0.000992
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000346 | Grad Max: 0.000346
[GRADIENT NORM TOTAL] 2.5581

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.777
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.72429097 0.27570903] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.080
[MASKS] A(Pass/Fail): 714/1334 | B: 641/1407 | C: 581/1467
[LOSS Ex1] A: 0.63508 | B: 0.61939 | C: 0.61210
[LOGITS Ex2 A] Mean Abs: 2.183 | Max: 6.060
[LOSS Ex2] A: 0.11627 | B: 0.32520 | C: 0.23020
** [JOINT LOSS] ** : 0.846076
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005200 | Grad Max: 0.138344
  -> Layer: shared_layers.0.bias | Grad Mean: 0.222020 | Grad Max: 1.130686
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.005769
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007912 | Grad Max: 0.007912
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001539 | Grad Max: 0.524224
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027586 | Grad Max: 2.948638
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.006236
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013324 | Grad Max: 0.072691
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000313
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002822 | Grad Max: 0.006321
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000172
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000713 | Grad Max: 0.002323
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000489 | Grad Max: 0.001836
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011319 | Grad Max: 0.011319
[GRADIENT NORM TOTAL] 5.6047

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.898
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63382417 0.36617583] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.080
[MASKS] A(Pass/Fail): 590/1026 | B: 644/1404 | C: 588/1460
[LOSS Ex1] A: 0.63324 | B: 0.61495 | C: 0.60861
[LOGITS Ex2 A] Mean Abs: 2.267 | Max: 6.890
[LOSS Ex2] A: 0.10701 | B: 0.30750 | C: 0.23123
** [JOINT LOSS] ** : 0.834180
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.069802
  -> Layer: shared_layers.0.bias | Grad Mean: 0.170480 | Grad Max: 0.898690
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006232
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009363 | Grad Max: 0.009363
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001286 | Grad Max: 0.280713
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023143 | Grad Max: 1.543488
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.004776
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006902 | Grad Max: 0.054952
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001238 | Grad Max: 0.004158
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000289 | Grad Max: 0.001198
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000329 | Grad Max: 0.001095
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003372 | Grad Max: 0.003372
[GRADIENT NORM TOTAL] 4.6450

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.103
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078022  0.49219784] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 721/1327 | B: 607/1249 | C: 404/972
[LOSS Ex1] A: 0.63394 | B: 0.61917 | C: 0.60063
[LOGITS Ex2 A] Mean Abs: 2.248 | Max: 8.094
[LOSS Ex2] A: 0.09871 | B: 0.31121 | C: 0.20130
** [JOINT LOSS] ** : 0.821657
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002925 | Grad Max: 0.096777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.116640 | Grad Max: 0.541411
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.005896
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005454 | Grad Max: 0.005454
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000975 | Grad Max: 0.224079
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016921 | Grad Max: 1.247358
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.003011
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002298 | Grad Max: 0.032238
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000165
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000461 | Grad Max: 0.002775
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000065
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000134 | Grad Max: 0.000648
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000451 | Grad Max: 0.001071
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001682 | Grad Max: 0.001682
[GRADIENT NORM TOTAL] 3.4716

[EPOCH SUMMARY] Train Loss: 0.8395

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8225 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8251 -> New: 0.8225)

############################## EPOCH 147/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.039
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50817853 0.4918214 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 715/1333 | B: 652/1396 | C: 595/1453
[LOSS Ex1] A: 0.63065 | B: 0.61866 | C: 0.61326
[LOGITS Ex2 A] Mean Abs: 2.248 | Max: 7.588
[LOSS Ex2] A: 0.10879 | B: 0.31946 | C: 0.22706
** [JOINT LOSS] ** : 0.839297
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003008 | Grad Max: 0.082961
  -> Layer: shared_layers.0.bias | Grad Mean: 0.211559 | Grad Max: 1.042914
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.006091
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000445 | Grad Max: 0.000445
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001008 | Grad Max: 0.542972
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017602 | Grad Max: 3.022865
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003619
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002984 | Grad Max: 0.030986
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000127
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000514 | Grad Max: 0.002747
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000062
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000144 | Grad Max: 0.000840
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000296 | Grad Max: 0.000969
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002942 | Grad Max: 0.002942
[GRADIENT NORM TOTAL] 5.5294

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.072
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074944 0.4925056] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 717/1331 | B: 641/1407 | C: 566/1482
[LOSS Ex1] A: 0.62962 | B: 0.61920 | C: 0.61560
[LOGITS Ex2 A] Mean Abs: 2.228 | Max: 7.530
[LOSS Ex2] A: 0.12412 | B: 0.32496 | C: 0.22146
** [JOINT LOSS] ** : 0.844985
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004102 | Grad Max: 0.115223
  -> Layer: shared_layers.0.bias | Grad Mean: 0.163014 | Grad Max: 0.584520
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006487
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000693 | Grad Max: 0.000693
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000984 | Grad Max: 0.579787
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016503 | Grad Max: 3.211062
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.002719
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003655 | Grad Max: 0.026557
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000191
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000764 | Grad Max: 0.003694
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000106
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000154 | Grad Max: 0.001056
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000297 | Grad Max: 0.000776
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000536 | Grad Max: 0.000536
[GRADIENT NORM TOTAL] 4.7957

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.886
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012984  0.49870154] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.559 | Std: 0.079
[MASKS] A(Pass/Fail): 688/1360 | B: 644/1404 | C: 573/1475
[LOSS Ex1] A: 0.63840 | B: 0.61476 | C: 0.61112
[LOGITS Ex2 A] Mean Abs: 2.206 | Max: 5.896
[LOSS Ex2] A: 0.10996 | B: 0.30103 | C: 0.22501
** [JOINT LOSS] ** : 0.833422
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003520 | Grad Max: 0.108756
  -> Layer: shared_layers.0.bias | Grad Mean: 0.124714 | Grad Max: 0.807372
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.005714
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006826 | Grad Max: 0.006826
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001101 | Grad Max: 0.439941
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018925 | Grad Max: 2.466285
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000097 | Grad Max: 0.003694
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007590 | Grad Max: 0.044428
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001606 | Grad Max: 0.004418
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000102
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000414 | Grad Max: 0.001219
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000361 | Grad Max: 0.001434
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007856 | Grad Max: 0.007856
[GRADIENT NORM TOTAL] 4.2404

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.799
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438232  0.45617682] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.076
[MASKS] A(Pass/Fail): 687/1361 | B: 607/1249 | C: 575/1473
[LOSS Ex1] A: 0.63881 | B: 0.61897 | C: 0.61034
[LOGITS Ex2 A] Mean Abs: 2.201 | Max: 6.203
[LOSS Ex2] A: 0.11910 | B: 0.29827 | C: 0.22526
** [JOINT LOSS] ** : 0.836915
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003164 | Grad Max: 0.084929
  -> Layer: shared_layers.0.bias | Grad Mean: 0.206179 | Grad Max: 1.098241
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005923
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002767 | Grad Max: 0.002767
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001195 | Grad Max: 0.347669
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021323 | Grad Max: 1.940694
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.003978
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007287 | Grad Max: 0.048631
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000173
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001344 | Grad Max: 0.004489
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000312 | Grad Max: 0.001023
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000334 | Grad Max: 0.000996
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003836 | Grad Max: 0.003836
[GRADIENT NORM TOTAL] 4.8106

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.997
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.79181033 0.20818964] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.081
[MASKS] A(Pass/Fail): 751/1297 | B: 653/1395 | C: 532/1516
[LOSS Ex1] A: 0.63242 | B: 0.61848 | C: 0.61682
[LOGITS Ex2 A] Mean Abs: 2.256 | Max: 6.608
[LOSS Ex2] A: 0.09665 | B: 0.32122 | C: 0.21944
** [JOINT LOSS] ** : 0.835009
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001867 | Grad Max: 0.084331
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152113 | Grad Max: 0.873201
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005674
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002758 | Grad Max: 0.002758
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000940 | Grad Max: 0.299713
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016782 | Grad Max: 1.661914
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.004188
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004404 | Grad Max: 0.030741
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000763 | Grad Max: 0.003813
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000185 | Grad Max: 0.001019
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000359 | Grad Max: 0.001100
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002498 | Grad Max: 0.002498
[GRADIENT NORM TOTAL] 4.1009

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.107
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002474  0.49975258] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.079
[MASKS] A(Pass/Fail): 721/1327 | B: 641/1407 | C: 548/1500
[LOSS Ex1] A: 0.63944 | B: 0.61901 | C: 0.61744
[LOGITS Ex2 A] Mean Abs: 2.233 | Max: 6.472
[LOSS Ex2] A: 0.09710 | B: 0.32031 | C: 0.23584
** [JOINT LOSS] ** : 0.843044
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004136 | Grad Max: 0.110028
  -> Layer: shared_layers.0.bias | Grad Mean: 0.249514 | Grad Max: 0.976144
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002021 | Grad Max: 0.005491
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002605 | Grad Max: 0.002605
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001572 | Grad Max: 0.208912
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028477 | Grad Max: 1.173194
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000172 | Grad Max: 0.006285
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014073 | Grad Max: 0.071386
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000286
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002900 | Grad Max: 0.006935
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000726 | Grad Max: 0.002173
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.001894
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012901 | Grad Max: 0.012901
[GRADIENT NORM TOTAL] 4.9015

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.782
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7253224  0.27467754] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.080
[MASKS] A(Pass/Fail): 714/1334 | B: 645/1403 | C: 583/1465
[LOSS Ex1] A: 0.63481 | B: 0.61456 | C: 0.60884
[LOGITS Ex2 A] Mean Abs: 2.217 | Max: 6.338
[LOSS Ex2] A: 0.12644 | B: 0.30286 | C: 0.22879
** [JOINT LOSS] ** : 0.838769
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002464 | Grad Max: 0.059432
  -> Layer: shared_layers.0.bias | Grad Mean: 0.170229 | Grad Max: 0.842809
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.005489
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003907 | Grad Max: 0.003907
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001307 | Grad Max: 0.412116
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023169 | Grad Max: 2.297638
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000097 | Grad Max: 0.004266
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007850 | Grad Max: 0.044676
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000206
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001533 | Grad Max: 0.004546
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000369 | Grad Max: 0.001371
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000292 | Grad Max: 0.001020
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005526 | Grad Max: 0.005526
[GRADIENT NORM TOTAL] 4.9415

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.903
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6342683 0.3657317] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.081
[MASKS] A(Pass/Fail): 590/1026 | B: 607/1249 | C: 571/1477
[LOSS Ex1] A: 0.63297 | B: 0.61878 | C: 0.61518
[LOGITS Ex2 A] Mean Abs: 2.274 | Max: 8.314
[LOSS Ex2] A: 0.10363 | B: 0.30870 | C: 0.22585
** [JOINT LOSS] ** : 0.835033
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.064543
  -> Layer: shared_layers.0.bias | Grad Mean: 0.080324 | Grad Max: 0.794805
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.006014
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008843 | Grad Max: 0.008843
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000724 | Grad Max: 0.141688
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012372 | Grad Max: 0.782374
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003090
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003212 | Grad Max: 0.033338
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000140
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000533 | Grad Max: 0.003041
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000151 | Grad Max: 0.000606
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000301 | Grad Max: 0.001075
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003356 | Grad Max: 0.003356
[GRADIENT NORM TOTAL] 2.5509

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.109
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50788724 0.4921128 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 721/1327 | B: 653/1395 | C: 600/1448
[LOSS Ex1] A: 0.63367 | B: 0.61829 | C: 0.60950
[LOGITS Ex2 A] Mean Abs: 2.251 | Max: 7.598
[LOSS Ex2] A: 0.09918 | B: 0.33106 | C: 0.22870
** [JOINT LOSS] ** : 0.840129
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003310 | Grad Max: 0.120129
  -> Layer: shared_layers.0.bias | Grad Mean: 0.184116 | Grad Max: 0.834594
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006020
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003056 | Grad Max: 0.003056
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001350 | Grad Max: 0.518991
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023306 | Grad Max: 2.884586
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.005774
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008497 | Grad Max: 0.067952
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000248
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001852 | Grad Max: 0.005309
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000130
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000449 | Grad Max: 0.001411
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000274 | Grad Max: 0.001131
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006083 | Grad Max: 0.006083
[GRADIENT NORM TOTAL] 5.6004

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.046
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080892  0.49191082] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 715/1333 | B: 641/1407 | C: 563/1485
[LOSS Ex1] A: 0.63037 | B: 0.61882 | C: 0.61305
[LOGITS Ex2 A] Mean Abs: 2.254 | Max: 5.518
[LOSS Ex2] A: 0.11513 | B: 0.32133 | C: 0.21372
** [JOINT LOSS] ** : 0.837469
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003676 | Grad Max: 0.116877
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141851 | Grad Max: 0.594346
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.006254
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007061 | Grad Max: 0.007061
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001189 | Grad Max: 0.511066
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020742 | Grad Max: 2.852502
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004290
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006337 | Grad Max: 0.048857
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000228
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001286 | Grad Max: 0.004143
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000134
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000293 | Grad Max: 0.001359
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000335 | Grad Max: 0.001023
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003120 | Grad Max: 0.003120
[GRADIENT NORM TOTAL] 5.0650

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.078
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.507666   0.49233398] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 718/1330 | B: 645/1403 | C: 559/1489
[LOSS Ex1] A: 0.62933 | B: 0.61436 | C: 0.61057
[LOGITS Ex2 A] Mean Abs: 2.208 | Max: 6.394
[LOSS Ex2] A: 0.12678 | B: 0.31369 | C: 0.21959
** [JOINT LOSS] ** : 0.838108
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003245 | Grad Max: 0.091487
  -> Layer: shared_layers.0.bias | Grad Mean: 0.252514 | Grad Max: 1.216639
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002315 | Grad Max: 0.005943
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002302 | Grad Max: 0.002302
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001729 | Grad Max: 0.271046
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031404 | Grad Max: 1.499895
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000183 | Grad Max: 0.006295
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014982 | Grad Max: 0.089770
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000266
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002858 | Grad Max: 0.007000
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000698 | Grad Max: 0.001865
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000450 | Grad Max: 0.001486
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010989 | Grad Max: 0.010989
[GRADIENT NORM TOTAL] 5.9057

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.892
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50133747 0.4986625 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 687/1361 | B: 607/1249 | C: 570/1478
[LOSS Ex1] A: 0.63812 | B: 0.61859 | C: 0.61263
[LOGITS Ex2 A] Mean Abs: 2.206 | Max: 6.694
[LOSS Ex2] A: 0.10659 | B: 0.30633 | C: 0.22652
** [JOINT LOSS] ** : 0.836260
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.055613
  -> Layer: shared_layers.0.bias | Grad Mean: 0.138130 | Grad Max: 0.574237
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005471
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001866 | Grad Max: 0.001866
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000977 | Grad Max: 0.208628
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017073 | Grad Max: 1.199041
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000080 | Grad Max: 0.003440
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006079 | Grad Max: 0.037982
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000188
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001293 | Grad Max: 0.004139
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000109
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000318 | Grad Max: 0.001171
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000353 | Grad Max: 0.001313
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005552 | Grad Max: 0.005552
[GRADIENT NORM TOTAL] 3.5304

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.803
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5437758 0.4562242] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.558 | Std: 0.076
[MASKS] A(Pass/Fail): 687/1361 | B: 653/1395 | C: 585/1463
[LOSS Ex1] A: 0.63855 | B: 0.61810 | C: 0.61642
[LOGITS Ex2 A] Mean Abs: 2.248 | Max: 6.164
[LOSS Ex2] A: 0.11796 | B: 0.32797 | C: 0.22603
** [JOINT LOSS] ** : 0.848343
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004763 | Grad Max: 0.231262
  -> Layer: shared_layers.0.bias | Grad Mean: 0.534115 | Grad Max: 2.877620
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.005747
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005704 | Grad Max: 0.005704
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003347 | Grad Max: 0.693671
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062418 | Grad Max: 3.864939
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.012818
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030748 | Grad Max: 0.172012
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000537
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005833 | Grad Max: 0.012789
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000281
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001424 | Grad Max: 0.004067
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000807 | Grad Max: 0.001948
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021649 | Grad Max: 0.021649
[GRADIENT NORM TOTAL] 12.3832

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.003
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7931068  0.20689319] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.081
[MASKS] A(Pass/Fail): 749/1299 | B: 641/1407 | C: 384/992
[LOSS Ex1] A: 0.63215 | B: 0.61862 | C: 0.61358
[LOGITS Ex2 A] Mean Abs: 2.283 | Max: 6.606
[LOSS Ex2] A: 0.10986 | B: 0.32373 | C: 0.23260
** [JOINT LOSS] ** : 0.843516
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004327 | Grad Max: 0.243253
  -> Layer: shared_layers.0.bias | Grad Mean: 0.516600 | Grad Max: 3.005739
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.006216
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001323 | Grad Max: 0.001323
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003344 | Grad Max: 0.731513
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062033 | Grad Max: 4.084873
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000338 | Grad Max: 0.012544
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028981 | Grad Max: 0.156985
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000427
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005616 | Grad Max: 0.011615
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000305
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001385 | Grad Max: 0.004318
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000726 | Grad Max: 0.002055
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020609 | Grad Max: 0.020609
[GRADIENT NORM TOTAL] 12.5890

[EPOCH SUMMARY] Train Loss: 0.8393

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8219 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8225 -> New: 0.8219)

############################## EPOCH 148/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.114
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500153   0.49984697] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 721/1327 | B: 645/1403 | C: 579/1469
[LOSS Ex1] A: 0.63918 | B: 0.61417 | C: 0.61109
[LOGITS Ex2 A] Mean Abs: 2.257 | Max: 5.892
[LOSS Ex2] A: 0.10344 | B: 0.30445 | C: 0.24359
** [JOINT LOSS] ** : 0.838642
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004052 | Grad Max: 0.168747
  -> Layer: shared_layers.0.bias | Grad Mean: 0.174264 | Grad Max: 1.119914
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005304
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000617 | Grad Max: 0.000617
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001251 | Grad Max: 0.386976
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020970 | Grad Max: 2.153755
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.003843
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003787 | Grad Max: 0.036609
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000162
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000418 | Grad Max: 0.002568
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.000759
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.000907
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000113 | Grad Max: 0.000113
[GRADIENT NORM TOTAL] 4.8246

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.786
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.72618556 0.2738145 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 714/1334 | B: 607/1249 | C: 583/1465
[LOSS Ex1] A: 0.63454 | B: 0.61840 | C: 0.61031
[LOGITS Ex2 A] Mean Abs: 2.193 | Max: 7.155
[LOSS Ex2] A: 0.12730 | B: 0.32562 | C: 0.21426
** [JOINT LOSS] ** : 0.843476
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008432 | Grad Max: 0.240733
  -> Layer: shared_layers.0.bias | Grad Mean: 0.661299 | Grad Max: 3.080513
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.006352
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000252 | Grad Max: 0.000252
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004323 | Grad Max: 0.640907
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.079612 | Grad Max: 3.617027
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000513 | Grad Max: 0.016316
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.043046 | Grad Max: 0.224571
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000676
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008566 | Grad Max: 0.017744
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000452
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002029 | Grad Max: 0.006424
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001206 | Grad Max: 0.002956
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030645 | Grad Max: 0.030645
[GRADIENT NORM TOTAL] 14.3054

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.909
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6346236  0.36537638] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.081
[MASKS] A(Pass/Fail): 590/1026 | B: 653/1395 | C: 571/1477
[LOSS Ex1] A: 0.63269 | B: 0.61792 | C: 0.60867
[LOGITS Ex2 A] Mean Abs: 2.258 | Max: 7.502
[LOSS Ex2] A: 0.10831 | B: 0.34133 | C: 0.20809
** [JOINT LOSS] ** : 0.839001
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005432 | Grad Max: 0.244929
  -> Layer: shared_layers.0.bias | Grad Mean: 0.659577 | Grad Max: 3.512191
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.006131
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000255 | Grad Max: 0.000255
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004106 | Grad Max: 0.718885
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.076436 | Grad Max: 4.047704
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.015796
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040053 | Grad Max: 0.216301
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000586
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007717 | Grad Max: 0.016154
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000377
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001849 | Grad Max: 0.005271
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001065 | Grad Max: 0.002593
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028235 | Grad Max: 0.028235
[GRADIENT NORM TOTAL] 14.8204

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.116
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50794154 0.49205846] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 721/1327 | B: 642/1406 | C: 570/1478
[LOSS Ex1] A: 0.63342 | B: 0.61845 | C: 0.61243
[LOGITS Ex2 A] Mean Abs: 2.284 | Max: 7.032
[LOSS Ex2] A: 0.09927 | B: 0.31766 | C: 0.21244
** [JOINT LOSS] ** : 0.831226
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003523 | Grad Max: 0.124000
  -> Layer: shared_layers.0.bias | Grad Mean: 0.163090 | Grad Max: 0.772394
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.005494
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002989 | Grad Max: 0.002989
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001189 | Grad Max: 0.446076
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021251 | Grad Max: 2.445360
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.003807
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006220 | Grad Max: 0.047666
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000213
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001369 | Grad Max: 0.004696
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000352 | Grad Max: 0.001257
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000445 | Grad Max: 0.001457
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005259 | Grad Max: 0.005259
[GRADIENT NORM TOTAL] 5.0581

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.052
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50798213 0.49201784] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 715/1333 | B: 645/1403 | C: 610/1438
[LOSS Ex1] A: 0.63012 | B: 0.61400 | C: 0.60973
[LOGITS Ex2 A] Mean Abs: 2.290 | Max: 6.265
[LOSS Ex2] A: 0.11433 | B: 0.30517 | C: 0.23490
** [JOINT LOSS] ** : 0.836084
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008644 | Grad Max: 0.230924
  -> Layer: shared_layers.0.bias | Grad Mean: 0.400923 | Grad Max: 1.441780
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002273 | Grad Max: 0.006117
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000957 | Grad Max: 0.000957
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002883 | Grad Max: 0.399275
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052360 | Grad Max: 2.217321
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.010521
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027211 | Grad Max: 0.137563
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000471
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005610 | Grad Max: 0.011327
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000273
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001392 | Grad Max: 0.004090
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000799 | Grad Max: 0.002216
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021195 | Grad Max: 0.021195
[GRADIENT NORM TOTAL] 8.5383

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.085
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50777346 0.49222654] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 719/1329 | B: 607/1249 | C: 574/1474
[LOSS Ex1] A: 0.62909 | B: 0.61823 | C: 0.61421
[LOGITS Ex2 A] Mean Abs: 2.261 | Max: 7.909
[LOSS Ex2] A: 0.12558 | B: 0.31199 | C: 0.22313
** [JOINT LOSS] ** : 0.840746
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006061 | Grad Max: 0.225433
  -> Layer: shared_layers.0.bias | Grad Mean: 0.131007 | Grad Max: 0.786776
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.006399
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000697 | Grad Max: 0.000697
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001250 | Grad Max: 0.253836
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020069 | Grad Max: 1.402760
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003144
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003400 | Grad Max: 0.023018
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000246
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000887 | Grad Max: 0.003468
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000120
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000237 | Grad Max: 0.001377
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000276 | Grad Max: 0.000997
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003732 | Grad Max: 0.003732
[GRADIENT NORM TOTAL] 3.7512

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.897
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012983  0.49870163] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 688/1360 | B: 653/1395 | C: 569/1479
[LOSS Ex1] A: 0.63790 | B: 0.61776 | C: 0.61018
[LOGITS Ex2 A] Mean Abs: 2.208 | Max: 6.763
[LOSS Ex2] A: 0.10350 | B: 0.33620 | C: 0.22778
** [JOINT LOSS] ** : 0.844443
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006795 | Grad Max: 0.197257
  -> Layer: shared_layers.0.bias | Grad Mean: 0.528215 | Grad Max: 2.278954
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.006200
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009680 | Grad Max: 0.009680
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003488 | Grad Max: 0.787651
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064619 | Grad Max: 4.374193
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000419 | Grad Max: 0.012362
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035340 | Grad Max: 0.166244
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000544
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007025 | Grad Max: 0.013838
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000360
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001723 | Grad Max: 0.005168
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001063 | Grad Max: 0.002746
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027589 | Grad Max: 0.027589
[GRADIENT NORM TOTAL] 11.8342

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.808
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438248 0.4561752] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.077
[MASKS] A(Pass/Fail): 687/1361 | B: 641/1407 | C: 574/1474
[LOSS Ex1] A: 0.63835 | B: 0.61829 | C: 0.61224
[LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.424
[LOSS Ex2] A: 0.11898 | B: 0.32295 | C: 0.20887
** [JOINT LOSS] ** : 0.839895
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007880 | Grad Max: 0.289802
  -> Layer: shared_layers.0.bias | Grad Mean: 0.250060 | Grad Max: 0.868961
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005675
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002466 | Grad Max: 0.002466
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001756 | Grad Max: 0.697389
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030453 | Grad Max: 3.874901
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000188 | Grad Max: 0.005799
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014597 | Grad Max: 0.073580
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000383
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003180 | Grad Max: 0.007879
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000763 | Grad Max: 0.001990
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000509 | Grad Max: 0.001784
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011839 | Grad Max: 0.011839
[GRADIENT NORM TOTAL] 6.5490

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.008
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.79429924 0.20570076] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.081
[MASKS] A(Pass/Fail): 753/1295 | B: 645/1403 | C: 547/1501
[LOSS Ex1] A: 0.63194 | B: 0.61384 | C: 0.61745
[LOGITS Ex2 A] Mean Abs: 2.252 | Max: 6.879
[LOSS Ex2] A: 0.10910 | B: 0.31047 | C: 0.24161
** [JOINT LOSS] ** : 0.841467
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004807 | Grad Max: 0.273387
  -> Layer: shared_layers.0.bias | Grad Mean: 0.567028 | Grad Max: 3.457232
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005595
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000017 | Grad Max: 0.000017
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003733 | Grad Max: 0.588589
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067983 | Grad Max: 3.251826
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.016065
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033337 | Grad Max: 0.199080
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000481
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006283 | Grad Max: 0.013507
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000287
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001542 | Grad Max: 0.004290
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000790 | Grad Max: 0.002069
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023389 | Grad Max: 0.023389
[GRADIENT NORM TOTAL] 13.3339

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.119
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50017214 0.49982783] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 721/1327 | B: 607/1249 | C: 549/1499
[LOSS Ex1] A: 0.63899 | B: 0.61806 | C: 0.61892
[LOGITS Ex2 A] Mean Abs: 2.280 | Max: 6.107
[LOSS Ex2] A: 0.09855 | B: 0.31590 | C: 0.23217
** [JOINT LOSS] ** : 0.840865
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006169 | Grad Max: 0.239992
  -> Layer: shared_layers.0.bias | Grad Mean: 0.522170 | Grad Max: 3.158928
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002016 | Grad Max: 0.005525
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000425 | Grad Max: 0.000425
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003575 | Grad Max: 0.639206
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063727 | Grad Max: 3.510342
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000372 | Grad Max: 0.013597
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031040 | Grad Max: 0.177144
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000410
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005592 | Grad Max: 0.011563
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001370 | Grad Max: 0.003988
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000669 | Grad Max: 0.001893
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020651 | Grad Max: 0.020651
[GRADIENT NORM TOTAL] 12.2430

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.791
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.72711104 0.27288896] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.081
[MASKS] A(Pass/Fail): 714/1334 | B: 653/1395 | C: 585/1463
[LOSS Ex1] A: 0.63434 | B: 0.61761 | C: 0.61077
[LOGITS Ex2 A] Mean Abs: 2.238 | Max: 6.788
[LOSS Ex2] A: 0.12692 | B: 0.32075 | C: 0.20830
** [JOINT LOSS] ** : 0.839566
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006425 | Grad Max: 0.277786
  -> Layer: shared_layers.0.bias | Grad Mean: 0.096087 | Grad Max: 0.615917
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.006055
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000733 | Grad Max: 0.000733
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001038 | Grad Max: 0.130181
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015506 | Grad Max: 0.610116
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.003876
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004036 | Grad Max: 0.029042
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000228
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000980 | Grad Max: 0.003995
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000238 | Grad Max: 0.001005
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.001109
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003603 | Grad Max: 0.003603
[GRADIENT NORM TOTAL] 2.6854

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.913
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6350478 0.3649522] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.081
[MASKS] A(Pass/Fail): 590/1026 | B: 642/1406 | C: 565/1483
[LOSS Ex1] A: 0.63249 | B: 0.61814 | C: 0.61178
[LOGITS Ex2 A] Mean Abs: 2.249 | Max: 7.915
[LOSS Ex2] A: 0.11599 | B: 0.32487 | C: 0.23232
** [JOINT LOSS] ** : 0.845200
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003401 | Grad Max: 0.096774
  -> Layer: shared_layers.0.bias | Grad Mean: 0.295513 | Grad Max: 1.116887
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006511
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011236 | Grad Max: 0.011236
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001744 | Grad Max: 0.389030
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032238 | Grad Max: 2.178115
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000189 | Grad Max: 0.009010
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015994 | Grad Max: 0.103976
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000315
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003110 | Grad Max: 0.007067
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000177
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000794 | Grad Max: 0.002178
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.001657
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014497 | Grad Max: 0.014497
[GRADIENT NORM TOTAL] 6.1930

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.121
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50791514 0.49208483] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 721/1327 | B: 645/1403 | C: 559/1489
[LOSS Ex1] A: 0.63322 | B: 0.61370 | C: 0.61431
[LOGITS Ex2 A] Mean Abs: 2.266 | Max: 7.681
[LOSS Ex2] A: 0.11250 | B: 0.30226 | C: 0.24155
** [JOINT LOSS] ** : 0.839176
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008813 | Grad Max: 0.400017
  -> Layer: shared_layers.0.bias | Grad Mean: 0.201370 | Grad Max: 0.859841
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.005517
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004762 | Grad Max: 0.004762
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001738 | Grad Max: 0.394614
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028606 | Grad Max: 1.980252
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000159 | Grad Max: 0.005886
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011916 | Grad Max: 0.065343
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000404
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002940 | Grad Max: 0.006744
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000202
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000720 | Grad Max: 0.002004
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000486 | Grad Max: 0.001631
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011036 | Grad Max: 0.011036
[GRADIENT NORM TOTAL] 5.4021

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.057
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079038 0.4920962] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 715/1333 | B: 607/1249 | C: 396/980
[LOSS Ex1] A: 0.62991 | B: 0.61791 | C: 0.61372
[LOGITS Ex2 A] Mean Abs: 2.262 | Max: 6.255
[LOSS Ex2] A: 0.11775 | B: 0.30376 | C: 0.23262
** [JOINT LOSS] ** : 0.838559
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010056 | Grad Max: 0.413016
  -> Layer: shared_layers.0.bias | Grad Mean: 0.185530 | Grad Max: 0.893856
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.005900
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002463 | Grad Max: 0.002463
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001512 | Grad Max: 0.364667
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022756 | Grad Max: 1.785062
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.005387
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005737 | Grad Max: 0.039202
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000306
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001545 | Grad Max: 0.004837
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000137
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000344 | Grad Max: 0.001438
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000268 | Grad Max: 0.001313
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004194 | Grad Max: 0.004194
[GRADIENT NORM TOTAL] 4.6611

[EPOCH SUMMARY] Train Loss: 0.8399

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8201 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8219 -> New: 0.8201)

############################## EPOCH 149/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.090
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078609  0.49213904] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 719/1329 | B: 653/1395 | C: 586/1462
[LOSS Ex1] A: 0.62888 | B: 0.61747 | C: 0.61554
[LOGITS Ex2 A] Mean Abs: 2.186 | Max: 6.160
[LOSS Ex2] A: 0.12498 | B: 0.32958 | C: 0.23491
** [JOINT LOSS] ** : 0.850455
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007338 | Grad Max: 0.295747
  -> Layer: shared_layers.0.bias | Grad Mean: 0.277528 | Grad Max: 1.423817
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006186
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003496 | Grad Max: 0.003496
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.283897
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035058 | Grad Max: 1.556970
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.008805
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014898 | Grad Max: 0.108191
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000242
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002392 | Grad Max: 0.005436
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000134
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000633 | Grad Max: 0.001885
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000345 | Grad Max: 0.001270
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011323 | Grad Max: 0.011323
[GRADIENT NORM TOTAL] 6.4258

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.901
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013038  0.49869618] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 687/1361 | B: 642/1406 | C: 557/1491
[LOSS Ex1] A: 0.63770 | B: 0.61799 | C: 0.61358
[LOGITS Ex2 A] Mean Abs: 2.206 | Max: 6.135
[LOSS Ex2] A: 0.10240 | B: 0.32253 | C: 0.21857
** [JOINT LOSS] ** : 0.837590
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002653 | Grad Max: 0.095454
  -> Layer: shared_layers.0.bias | Grad Mean: 0.148549 | Grad Max: 1.185788
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005733
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008016 | Grad Max: 0.008016
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000873 | Grad Max: 0.328486
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015312 | Grad Max: 1.831441
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002748
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002219 | Grad Max: 0.021813
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000127
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000374 | Grad Max: 0.002637
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.001085
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000406 | Grad Max: 0.001214
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000515 | Grad Max: 0.000515
[GRADIENT NORM TOTAL] 3.9925

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.811
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54374146 0.45625857] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.077
[MASKS] A(Pass/Fail): 688/1360 | B: 645/1403 | C: 573/1475
[LOSS Ex1] A: 0.63815 | B: 0.61354 | C: 0.60960
[LOGITS Ex2 A] Mean Abs: 2.204 | Max: 6.308
[LOSS Ex2] A: 0.11379 | B: 0.29815 | C: 0.22330
** [JOINT LOSS] ** : 0.832178
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005911 | Grad Max: 0.215428
  -> Layer: shared_layers.0.bias | Grad Mean: 0.338015 | Grad Max: 2.176780
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.005780
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007644 | Grad Max: 0.007644
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002026 | Grad Max: 0.471597
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034735 | Grad Max: 2.614603
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000151 | Grad Max: 0.006400
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011064 | Grad Max: 0.079390
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000250
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001712 | Grad Max: 0.004549
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000095
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000436 | Grad Max: 0.001277
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000369 | Grad Max: 0.001027
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006153 | Grad Max: 0.006153
[GRADIENT NORM TOTAL] 7.5870

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.013
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7952576  0.20474239] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.081
[MASKS] A(Pass/Fail): 753/1295 | B: 607/1249 | C: 578/1470
[LOSS Ex1] A: 0.63172 | B: 0.61776 | C: 0.61453
[LOGITS Ex2 A] Mean Abs: 2.238 | Max: 6.937
[LOSS Ex2] A: 0.11064 | B: 0.30467 | C: 0.23354
** [JOINT LOSS] ** : 0.837624
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006133 | Grad Max: 0.228967
  -> Layer: shared_layers.0.bias | Grad Mean: 0.227007 | Grad Max: 1.429535
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.006164
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008347 | Grad Max: 0.008347
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001544 | Grad Max: 0.398806
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025809 | Grad Max: 2.187736
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.005565
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005762 | Grad Max: 0.066694
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000194
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000632 | Grad Max: 0.003767
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000068
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000177 | Grad Max: 0.000958
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000318 | Grad Max: 0.000816
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002180 | Grad Max: 0.002180
[GRADIENT NORM TOTAL] 5.2652

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.125
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50007623 0.4999238 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 721/1327 | B: 654/1394 | C: 574/1474
[LOSS Ex1] A: 0.63878 | B: 0.61732 | C: 0.61267
[LOGITS Ex2 A] Mean Abs: 2.215 | Max: 6.600
[LOSS Ex2] A: 0.11256 | B: 0.33534 | C: 0.21128
** [JOINT LOSS] ** : 0.842648
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010415 | Grad Max: 0.375245
  -> Layer: shared_layers.0.bias | Grad Mean: 0.440335 | Grad Max: 2.069128
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.005830
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001654 | Grad Max: 0.001654
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002923 | Grad Max: 0.731435
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051470 | Grad Max: 4.074980
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.008467
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023786 | Grad Max: 0.113171
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000573
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005263 | Grad Max: 0.011656
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000302
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001253 | Grad Max: 0.003795
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000821 | Grad Max: 0.002311
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019501 | Grad Max: 0.019501
[GRADIENT NORM TOTAL] 10.2976

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.794
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.727703   0.27229697] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.081
[MASKS] A(Pass/Fail): 714/1334 | B: 644/1404 | C: 593/1455
[LOSS Ex1] A: 0.63411 | B: 0.61784 | C: 0.60555
[LOGITS Ex2 A] Mean Abs: 2.220 | Max: 6.718
[LOSS Ex2] A: 0.12190 | B: 0.32468 | C: 0.20890
** [JOINT LOSS] ** : 0.837659
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007396 | Grad Max: 0.240756
  -> Layer: shared_layers.0.bias | Grad Mean: 0.366453 | Grad Max: 1.773302
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.006574
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011153 | Grad Max: 0.011153
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.638623
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035634 | Grad Max: 3.567219
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000192 | Grad Max: 0.006111
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015368 | Grad Max: 0.071504
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000343
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003356 | Grad Max: 0.008302
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000176
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000803 | Grad Max: 0.002415
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000570 | Grad Max: 0.001915
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013291 | Grad Max: 0.013291
[GRADIENT NORM TOTAL] 8.5848

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.918
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6353262 0.3646738] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.081
[MASKS] A(Pass/Fail): 590/1026 | B: 646/1402 | C: 577/1471
[LOSS Ex1] A: 0.63226 | B: 0.61339 | C: 0.61548
[LOGITS Ex2 A] Mean Abs: 2.290 | Max: 7.722
[LOSS Ex2] A: 0.10801 | B: 0.30151 | C: 0.23413
** [JOINT LOSS] ** : 0.834929
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003013 | Grad Max: 0.105536
  -> Layer: shared_layers.0.bias | Grad Mean: 0.293278 | Grad Max: 1.318778
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005836
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005311 | Grad Max: 0.005311
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.485572
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041330 | Grad Max: 2.703089
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000218 | Grad Max: 0.007507
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018481 | Grad Max: 0.096666
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000323
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003430 | Grad Max: 0.007728
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000179
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000838 | Grad Max: 0.002708
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.001610
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011734 | Grad Max: 0.011734
[GRADIENT NORM TOTAL] 7.8017

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.127
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080019  0.49199808] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 721/1327 | B: 608/1248 | C: 559/1489
[LOSS Ex1] A: 0.63299 | B: 0.61761 | C: 0.61260
[LOGITS Ex2 A] Mean Abs: 2.282 | Max: 7.408
[LOSS Ex2] A: 0.09860 | B: 0.30142 | C: 0.22150
** [JOINT LOSS] ** : 0.828243
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005405 | Grad Max: 0.211060
  -> Layer: shared_layers.0.bias | Grad Mean: 0.302336 | Grad Max: 1.280491
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.005827
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007650 | Grad Max: 0.007650
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.278072
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041069 | Grad Max: 1.517658
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.008995
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020167 | Grad Max: 0.109075
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000316
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004226 | Grad Max: 0.008694
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001106 | Grad Max: 0.002969
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000601 | Grad Max: 0.002133
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017214 | Grad Max: 0.017214
[GRADIENT NORM TOTAL] 6.9460

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.062
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50780654 0.4921934 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 715/1333 | B: 654/1394 | C: 595/1453
[LOSS Ex1] A: 0.62968 | B: 0.61718 | C: 0.61210
[LOGITS Ex2 A] Mean Abs: 2.246 | Max: 5.305
[LOSS Ex2] A: 0.10564 | B: 0.33649 | C: 0.23694
** [JOINT LOSS] ** : 0.846012
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004191 | Grad Max: 0.133986
  -> Layer: shared_layers.0.bias | Grad Mean: 0.347512 | Grad Max: 1.843104
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.006589
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005553 | Grad Max: 0.005553
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.260218
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040158 | Grad Max: 1.446332
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000194 | Grad Max: 0.008099
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016475 | Grad Max: 0.097674
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000254
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002928 | Grad Max: 0.006956
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000163
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.002249
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001155
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010537 | Grad Max: 0.010537
[GRADIENT NORM TOTAL] 7.6014

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.095
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080095 0.4919905] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 718/1330 | B: 644/1404 | C: 602/1446
[LOSS Ex1] A: 0.62865 | B: 0.61770 | C: 0.60844
[LOGITS Ex2 A] Mean Abs: 2.200 | Max: 7.684
[LOSS Ex2] A: 0.12119 | B: 0.32697 | C: 0.23517
** [JOINT LOSS] ** : 0.846045
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004874 | Grad Max: 0.174399
  -> Layer: shared_layers.0.bias | Grad Mean: 0.362096 | Grad Max: 1.856905
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006139
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006520 | Grad Max: 0.006520
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002495 | Grad Max: 0.277167
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044080 | Grad Max: 1.527525
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.010258
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020795 | Grad Max: 0.144833
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000291
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003718 | Grad Max: 0.008629
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000936 | Grad Max: 0.002615
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.001606
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015671 | Grad Max: 0.015671
[GRADIENT NORM TOTAL] 7.9223

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.906
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013242  0.49867585] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 687/1361 | B: 646/1402 | C: 549/1499
[LOSS Ex1] A: 0.63748 | B: 0.61325 | C: 0.61817
[LOGITS Ex2 A] Mean Abs: 2.204 | Max: 5.896
[LOSS Ex2] A: 0.09952 | B: 0.30116 | C: 0.24125
** [JOINT LOSS] ** : 0.836944
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004379 | Grad Max: 0.105056
  -> Layer: shared_layers.0.bias | Grad Mean: 0.269539 | Grad Max: 1.245909
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.005232
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003668 | Grad Max: 0.003668
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001611 | Grad Max: 0.448073
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029059 | Grad Max: 2.505992
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.004579
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011617 | Grad Max: 0.055547
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000229
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002384 | Grad Max: 0.006414
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000166
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000592 | Grad Max: 0.002168
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000408 | Grad Max: 0.001480
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008993 | Grad Max: 0.008993
[GRADIENT NORM TOTAL] 6.4298

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.815
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54365367 0.4563463 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.077
[MASKS] A(Pass/Fail): 688/1360 | B: 608/1248 | C: 586/1462
[LOSS Ex1] A: 0.63794 | B: 0.61746 | C: 0.60824
[LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.236
[LOSS Ex2] A: 0.11612 | B: 0.29976 | C: 0.19290
** [JOINT LOSS] ** : 0.824143
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003182 | Grad Max: 0.138788
  -> Layer: shared_layers.0.bias | Grad Mean: 0.235455 | Grad Max: 1.526980
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.005806
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006235 | Grad Max: 0.006235
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001409 | Grad Max: 0.357189
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025390 | Grad Max: 1.988703
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000105 | Grad Max: 0.005967
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008717 | Grad Max: 0.056744
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000213
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001865 | Grad Max: 0.005304
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000114
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000508 | Grad Max: 0.001623
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000455 | Grad Max: 0.001480
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008352 | Grad Max: 0.008352
[GRADIENT NORM TOTAL] 5.6023

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.017
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7962062  0.20379382] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.082
[MASKS] A(Pass/Fail): 748/1300 | B: 655/1393 | C: 570/1478
[LOSS Ex1] A: 0.63152 | B: 0.61704 | C: 0.60909
[LOGITS Ex2 A] Mean Abs: 2.233 | Max: 6.111
[LOSS Ex2] A: 0.10174 | B: 0.32398 | C: 0.21004
** [JOINT LOSS] ** : 0.831135
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.056891
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125098 | Grad Max: 0.685857
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006157
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002652 | Grad Max: 0.002652
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001045 | Grad Max: 0.148672
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018952 | Grad Max: 0.831788
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000080 | Grad Max: 0.004002
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006536 | Grad Max: 0.038221
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000178
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001283 | Grad Max: 0.004097
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000102
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000300 | Grad Max: 0.000999
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000364 | Grad Max: 0.001265
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005219 | Grad Max: 0.005219
[GRADIENT NORM TOTAL] 3.4499

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.130
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000562  0.49994382] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.080
[MASKS] A(Pass/Fail): 721/1327 | B: 644/1404 | C: 357/1019
[LOSS Ex1] A: 0.63859 | B: 0.61755 | C: 0.61275
[LOGITS Ex2 A] Mean Abs: 2.269 | Max: 6.778
[LOSS Ex2] A: 0.09475 | B: 0.32263 | C: 0.20967
** [JOINT LOSS] ** : 0.831978
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004249 | Grad Max: 0.150482
  -> Layer: shared_layers.0.bias | Grad Mean: 0.078697 | Grad Max: 0.600354
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.005434
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002130 | Grad Max: 0.002130
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000780 | Grad Max: 0.105018
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012371 | Grad Max: 0.585196
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.002597
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002568 | Grad Max: 0.024560
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000152
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000361 | Grad Max: 0.002350
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000090
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000097 | Grad Max: 0.000639
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000407 | Grad Max: 0.001334
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001492 | Grad Max: 0.001492
[GRADIENT NORM TOTAL] 2.2450

[EPOCH SUMMARY] Train Loss: 0.8370

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8174 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8201 -> New: 0.8174)

############################## EPOCH 150/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.798
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7283624 0.2716376] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.081
[MASKS] A(Pass/Fail): 714/1334 | B: 646/1402 | C: 572/1476
[LOSS Ex1] A: 0.63390 | B: 0.61309 | C: 0.61172
[LOGITS Ex2 A] Mean Abs: 2.237 | Max: 6.820
[LOSS Ex2] A: 0.12262 | B: 0.30470 | C: 0.22358
** [JOINT LOSS] ** : 0.836537
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003021 | Grad Max: 0.062888
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152465 | Grad Max: 0.605028
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005929
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000624 | Grad Max: 0.000624
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000988 | Grad Max: 0.456241
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016892 | Grad Max: 2.566595
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002698
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002638 | Grad Max: 0.025500
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000125
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000430 | Grad Max: 0.003058
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000084
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000101 | Grad Max: 0.000632
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000306 | Grad Max: 0.000994
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000019 | Grad Max: 0.000019
[GRADIENT NORM TOTAL] 4.8113

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.922
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6355784  0.36442164] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 590/1026 | B: 608/1248 | C: 585/1463
[LOSS Ex1] A: 0.63205 | B: 0.61729 | C: 0.60588
[LOGITS Ex2 A] Mean Abs: 2.309 | Max: 8.461
[LOSS Ex2] A: 0.09905 | B: 0.29895 | C: 0.20574
** [JOINT LOSS] ** : 0.819651
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002592 | Grad Max: 0.062312
  -> Layer: shared_layers.0.bias | Grad Mean: 0.177754 | Grad Max: 0.697123
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.006090
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006580 | Grad Max: 0.006580
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001141 | Grad Max: 0.549817
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020275 | Grad Max: 3.059267
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.005483
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006416 | Grad Max: 0.057632
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001199 | Grad Max: 0.004209
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000270 | Grad Max: 0.001148
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000402 | Grad Max: 0.001257
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003562 | Grad Max: 0.003562
[GRADIENT NORM TOTAL] 5.1088

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.132
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080231  0.49197695] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 721/1327 | B: 655/1393 | C: 588/1460
[LOSS Ex1] A: 0.63278 | B: 0.61687 | C: 0.61222
[LOGITS Ex2 A] Mean Abs: 2.287 | Max: 8.585
[LOSS Ex2] A: 0.09971 | B: 0.31983 | C: 0.21800
** [JOINT LOSS] ** : 0.833133
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003319 | Grad Max: 0.092843
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140289 | Grad Max: 0.827102
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005909
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003560 | Grad Max: 0.003560
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001002 | Grad Max: 0.253330
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017482 | Grad Max: 1.422133
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000098 | Grad Max: 0.004137
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008207 | Grad Max: 0.044274
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000207
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001716 | Grad Max: 0.004402
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000121
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000406 | Grad Max: 0.001252
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000358 | Grad Max: 0.001054
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005244 | Grad Max: 0.005244
[GRADIENT NORM TOTAL] 3.4733

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.067
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50764364 0.49235636] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 715/1333 | B: 644/1404 | C: 563/1485
[LOSS Ex1] A: 0.62947 | B: 0.61737 | C: 0.61344
[LOGITS Ex2 A] Mean Abs: 2.284 | Max: 6.556
[LOSS Ex2] A: 0.10988 | B: 0.31730 | C: 0.23122
** [JOINT LOSS] ** : 0.839558
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003735 | Grad Max: 0.124624
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134374 | Grad Max: 0.592684
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.006352
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004454 | Grad Max: 0.004454
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001107 | Grad Max: 0.193662
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019168 | Grad Max: 1.074257
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.005241
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007117 | Grad Max: 0.056039
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000144
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001337 | Grad Max: 0.003774
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000366 | Grad Max: 0.001407
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000288 | Grad Max: 0.001242
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007408 | Grad Max: 0.007408
[GRADIENT NORM TOTAL] 3.5693

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50814277 0.49185726] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 717/1331 | B: 646/1402 | C: 563/1485
[LOSS Ex1] A: 0.62843 | B: 0.61292 | C: 0.61443
[LOGITS Ex2 A] Mean Abs: 2.266 | Max: 6.942
[LOSS Ex2] A: 0.12202 | B: 0.29275 | C: 0.21722
** [JOINT LOSS] ** : 0.829255
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004910 | Grad Max: 0.161685
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156715 | Grad Max: 0.799183
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.006426
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003505 | Grad Max: 0.003505
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001152 | Grad Max: 0.279809
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019708 | Grad Max: 1.562342
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.004920
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007434 | Grad Max: 0.047898
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000258
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001603 | Grad Max: 0.004934
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000128
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001278
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000376 | Grad Max: 0.001361
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004733 | Grad Max: 0.004733
[GRADIENT NORM TOTAL] 3.8622

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.911
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013079  0.49869213] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 687/1361 | B: 608/1248 | C: 586/1462
[LOSS Ex1] A: 0.63725 | B: 0.61709 | C: 0.60854
[LOGITS Ex2 A] Mean Abs: 2.220 | Max: 6.264
[LOSS Ex2] A: 0.10556 | B: 0.30868 | C: 0.23407
** [JOINT LOSS] ** : 0.837063
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003104 | Grad Max: 0.095839
  -> Layer: shared_layers.0.bias | Grad Mean: 0.256144 | Grad Max: 1.286984
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005713
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004798 | Grad Max: 0.004798
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001597 | Grad Max: 0.549105
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028952 | Grad Max: 3.059704
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000160 | Grad Max: 0.005769
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013527 | Grad Max: 0.065727
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000297
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002706 | Grad Max: 0.006246
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000190
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000651 | Grad Max: 0.002439
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001473
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009918 | Grad Max: 0.009918
[GRADIENT NORM TOTAL] 6.8848

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.819
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54354715 0.45645288] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.077
[MASKS] A(Pass/Fail): 688/1360 | B: 655/1393 | C: 586/1462
[LOSS Ex1] A: 0.63772 | B: 0.61669 | C: 0.61281
[LOGITS Ex2 A] Mean Abs: 2.225 | Max: 6.296
[LOSS Ex2] A: 0.11460 | B: 0.31993 | C: 0.25148
** [JOINT LOSS] ** : 0.851075
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005479 | Grad Max: 0.175898
  -> Layer: shared_layers.0.bias | Grad Mean: 0.154084 | Grad Max: 0.826623
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005513
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008562 | Grad Max: 0.008562
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001312 | Grad Max: 0.470352
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022797 | Grad Max: 2.624824
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.003548
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009052 | Grad Max: 0.041698
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000264
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002049 | Grad Max: 0.005141
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000127
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000551 | Grad Max: 0.001754
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001339
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010071 | Grad Max: 0.010071
[GRADIENT NORM TOTAL] 4.8038

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.022
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7973016  0.20269844] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.082
[MASKS] A(Pass/Fail): 748/1300 | B: 644/1404 | C: 591/1457
[LOSS Ex1] A: 0.63128 | B: 0.61719 | C: 0.61102
[LOGITS Ex2 A] Mean Abs: 2.314 | Max: 7.351
[LOSS Ex2] A: 0.09691 | B: 0.31611 | C: 0.22774
** [JOINT LOSS] ** : 0.833413
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005985 | Grad Max: 0.233211
  -> Layer: shared_layers.0.bias | Grad Mean: 0.614676 | Grad Max: 3.069510
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.006133
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001937 | Grad Max: 0.001937
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003906 | Grad Max: 0.656105
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.072788 | Grad Max: 3.667587
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000439 | Grad Max: 0.016200
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.038241 | Grad Max: 0.226423
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000549
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007407 | Grad Max: 0.014691
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000393
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001799 | Grad Max: 0.005117
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000944 | Grad Max: 0.002443
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026145 | Grad Max: 0.026145
[GRADIENT NORM TOTAL] 13.5825

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.136
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000295  0.49997053] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.081
[MASKS] A(Pass/Fail): 722/1326 | B: 646/1402 | C: 598/1450
[LOSS Ex1] A: 0.63835 | B: 0.61273 | C: 0.60777
[LOGITS Ex2 A] Mean Abs: 2.303 | Max: 6.290
[LOSS Ex2] A: 0.09401 | B: 0.30569 | C: 0.22571
** [JOINT LOSS] ** : 0.828085
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003216 | Grad Max: 0.151026
  -> Layer: shared_layers.0.bias | Grad Mean: 0.372298 | Grad Max: 2.001792
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005477
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005665 | Grad Max: 0.005665
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002544 | Grad Max: 0.476935
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046827 | Grad Max: 2.678825
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.010679
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020948 | Grad Max: 0.133889
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000338
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003918 | Grad Max: 0.009144
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000924 | Grad Max: 0.003122
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000441 | Grad Max: 0.001525
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012729 | Grad Max: 0.012729
[GRADIENT NORM TOTAL] 9.2387

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.802
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7290793 0.2709207] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.082
[MASKS] A(Pass/Fail): 713/1335 | B: 608/1248 | C: 563/1485
[LOSS Ex1] A: 0.63365 | B: 0.61691 | C: 0.61223
[LOGITS Ex2 A] Mean Abs: 2.245 | Max: 6.748
[LOSS Ex2] A: 0.12569 | B: 0.30188 | C: 0.21097
** [JOINT LOSS] ** : 0.833780
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005138 | Grad Max: 0.171460
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207652 | Grad Max: 0.811173
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005922
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002197 | Grad Max: 0.002197
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001526 | Grad Max: 0.173822
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026818 | Grad Max: 0.877980
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000155 | Grad Max: 0.005259
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012647 | Grad Max: 0.071175
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000291
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002589 | Grad Max: 0.006558
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000176
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000593 | Grad Max: 0.002490
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001660
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008600 | Grad Max: 0.008600
[GRADIENT NORM TOTAL] 4.4059

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.927
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63589865 0.36410135] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 590/1026 | B: 655/1393 | C: 589/1459
[LOSS Ex1] A: 0.63181 | B: 0.61652 | C: 0.60619
[LOGITS Ex2 A] Mean Abs: 2.300 | Max: 9.895
[LOSS Ex2] A: 0.10115 | B: 0.31733 | C: 0.19664
** [JOINT LOSS] ** : 0.823216
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003780 | Grad Max: 0.100368
  -> Layer: shared_layers.0.bias | Grad Mean: 0.248829 | Grad Max: 0.981002
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.005803
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005058 | Grad Max: 0.005058
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001480 | Grad Max: 0.230409
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026743 | Grad Max: 1.281951
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.004986
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010899 | Grad Max: 0.072478
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000220
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001989 | Grad Max: 0.005390
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000160
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000431 | Grad Max: 0.001796
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000393 | Grad Max: 0.001614
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006096 | Grad Max: 0.006096
[GRADIENT NORM TOTAL] 5.1870

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.138
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50800174 0.49199826] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 722/1326 | B: 644/1404 | C: 563/1485
[LOSS Ex1] A: 0.63253 | B: 0.61702 | C: 0.61314
[LOGITS Ex2 A] Mean Abs: 2.312 | Max: 8.742
[LOSS Ex2] A: 0.10277 | B: 0.31695 | C: 0.24608
** [JOINT LOSS] ** : 0.842829
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007037 | Grad Max: 0.189326
  -> Layer: shared_layers.0.bias | Grad Mean: 0.455688 | Grad Max: 2.230294
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005687
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005237 | Grad Max: 0.005237
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003196 | Grad Max: 0.539846
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058896 | Grad Max: 3.035585
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.011699
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029193 | Grad Max: 0.171446
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000530
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005812 | Grad Max: 0.012738
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000308
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001400 | Grad Max: 0.004380
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000741 | Grad Max: 0.002006
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019827 | Grad Max: 0.019827
[GRADIENT NORM TOTAL] 10.4140

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50755113 0.49244887] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 715/1333 | B: 646/1402 | C: 568/1480
[LOSS Ex1] A: 0.62923 | B: 0.61256 | C: 0.61511
[LOGITS Ex2 A] Mean Abs: 2.316 | Max: 6.760
[LOSS Ex2] A: 0.10934 | B: 0.30089 | C: 0.20418
** [JOINT LOSS] ** : 0.823769
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007539 | Grad Max: 0.195031
  -> Layer: shared_layers.0.bias | Grad Mean: 0.375303 | Grad Max: 1.520992
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.005989
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003333 | Grad Max: 0.003333
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002709 | Grad Max: 0.390236
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049560 | Grad Max: 2.118659
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.008869
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024408 | Grad Max: 0.130866
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000409
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005019 | Grad Max: 0.010423
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000251
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001243 | Grad Max: 0.003694
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000705 | Grad Max: 0.002193
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018946 | Grad Max: 0.018946
[GRADIENT NORM TOTAL] 8.4697

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.107
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082463  0.49175373] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 717/1331 | B: 608/1248 | C: 368/1008
[LOSS Ex1] A: 0.62819 | B: 0.61675 | C: 0.61723
[LOGITS Ex2 A] Mean Abs: 2.231 | Max: 6.957
[LOSS Ex2] A: 0.11802 | B: 0.30828 | C: 0.22250
** [JOINT LOSS] ** : 0.836989
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004290 | Grad Max: 0.181603
  -> Layer: shared_layers.0.bias | Grad Mean: 0.474487 | Grad Max: 2.337373
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006307
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004165 | Grad Max: 0.004165
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002905 | Grad Max: 0.591017
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053546 | Grad Max: 3.244539
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.010252
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027756 | Grad Max: 0.143026
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000478
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005185 | Grad Max: 0.012410
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000265
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001251 | Grad Max: 0.003783
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000729 | Grad Max: 0.002165
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019502 | Grad Max: 0.019502
[GRADIENT NORM TOTAL] 10.4690

[EPOCH SUMMARY] Train Loss: 0.8335

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8202 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 151/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.916
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012677 0.4987323] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.081
[MASKS] A(Pass/Fail): 688/1360 | B: 656/1392 | C: 562/1486
[LOSS Ex1] A: 0.63703 | B: 0.61637 | C: 0.61047
[LOGITS Ex2 A] Mean Abs: 2.213 | Max: 5.732
[LOSS Ex2] A: 0.10685 | B: 0.33384 | C: 0.21725
** [JOINT LOSS] ** : 0.840601
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004339 | Grad Max: 0.201389
  -> Layer: shared_layers.0.bias | Grad Mean: 0.531898 | Grad Max: 2.716517
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.005494
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003998 | Grad Max: 0.003998
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003505 | Grad Max: 0.673421
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065270 | Grad Max: 3.781298
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000372 | Grad Max: 0.013752
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032657 | Grad Max: 0.177564
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000487
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006344 | Grad Max: 0.013438
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000326
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001516 | Grad Max: 0.004482
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000851 | Grad Max: 0.002172
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023054 | Grad Max: 0.023054
[GRADIENT NORM TOTAL] 12.3560

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.822
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5435288 0.4564712] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 688/1360 | B: 644/1404 | C: 600/1448
[LOSS Ex1] A: 0.63751 | B: 0.61687 | C: 0.60988
[LOGITS Ex2 A] Mean Abs: 2.230 | Max: 5.923
[LOSS Ex2] A: 0.11357 | B: 0.31748 | C: 0.22220
** [JOINT LOSS] ** : 0.839170
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.052870
  -> Layer: shared_layers.0.bias | Grad Mean: 0.154794 | Grad Max: 1.036783
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005690
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006130 | Grad Max: 0.006130
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000994 | Grad Max: 0.492011
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017731 | Grad Max: 2.736600
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003463
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006143 | Grad Max: 0.036642
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000189
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001110 | Grad Max: 0.004088
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000242 | Grad Max: 0.001096
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000338 | Grad Max: 0.001319
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004298 | Grad Max: 0.004298
[GRADIENT NORM TOTAL] 4.6773

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.028
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7982587 0.2017413] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.082
[MASKS] A(Pass/Fail): 748/1300 | B: 646/1402 | C: 600/1448
[LOSS Ex1] A: 0.63108 | B: 0.61241 | C: 0.60652
[LOGITS Ex2 A] Mean Abs: 2.310 | Max: 7.499
[LOSS Ex2] A: 0.10190 | B: 0.31319 | C: 0.21767
** [JOINT LOSS] ** : 0.827590
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009020 | Grad Max: 0.292535
  -> Layer: shared_layers.0.bias | Grad Mean: 0.790358 | Grad Max: 3.739343
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.005757
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004533 | Grad Max: 0.004533
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005048 | Grad Max: 0.837127
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.093799 | Grad Max: 4.649765
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000570 | Grad Max: 0.019663
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049573 | Grad Max: 0.254771
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000720
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009735 | Grad Max: 0.020284
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000431
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002364 | Grad Max: 0.006615
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001290 | Grad Max: 0.002965
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035539 | Grad Max: 0.035539
[GRADIENT NORM TOTAL] 17.5784

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.142
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000022  0.49999776] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.081
[MASKS] A(Pass/Fail): 723/1325 | B: 608/1248 | C: 579/1469
[LOSS Ex1] A: 0.63816 | B: 0.61660 | C: 0.61075
[LOGITS Ex2 A] Mean Abs: 2.341 | Max: 6.679
[LOSS Ex2] A: 0.09228 | B: 0.31556 | C: 0.22008
** [JOINT LOSS] ** : 0.831138
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005704 | Grad Max: 0.255763
  -> Layer: shared_layers.0.bias | Grad Mean: 0.687486 | Grad Max: 3.456053
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005613
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002492 | Grad Max: 0.002492
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004593 | Grad Max: 0.797272
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086076 | Grad Max: 4.467910
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000506 | Grad Max: 0.017860
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044409 | Grad Max: 0.257190
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000647
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008490 | Grad Max: 0.016977
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000373
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002051 | Grad Max: 0.005779
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001128 | Grad Max: 0.002810
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031276 | Grad Max: 0.031276
[GRADIENT NORM TOTAL] 16.1804

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.805
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7296981  0.27030194] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.082
[MASKS] A(Pass/Fail): 714/1334 | B: 656/1392 | C: 577/1471
[LOSS Ex1] A: 0.63346 | B: 0.61623 | C: 0.61105
[LOGITS Ex2 A] Mean Abs: 2.286 | Max: 6.397
[LOSS Ex2] A: 0.12938 | B: 0.31952 | C: 0.22544
** [JOINT LOSS] ** : 0.845023
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003538 | Grad Max: 0.129949
  -> Layer: shared_layers.0.bias | Grad Mean: 0.249851 | Grad Max: 1.255929
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.006008
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001259 | Grad Max: 0.001259
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.507691
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031229 | Grad Max: 2.803877
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.006619
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014324 | Grad Max: 0.093441
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000285
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002477 | Grad Max: 0.006708
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000571 | Grad Max: 0.001860
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000296 | Grad Max: 0.001011
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008399 | Grad Max: 0.008399
[GRADIENT NORM TOTAL] 6.5383

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.931
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6362463  0.36375368] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 590/1026 | B: 644/1404 | C: 567/1481
[LOSS Ex1] A: 0.63162 | B: 0.61673 | C: 0.60957
[LOGITS Ex2 A] Mean Abs: 2.271 | Max: 10.422
[LOSS Ex2] A: 0.10432 | B: 0.33973 | C: 0.22280
** [JOINT LOSS] ** : 0.841587
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010165 | Grad Max: 0.282641
  -> Layer: shared_layers.0.bias | Grad Mean: 0.786371 | Grad Max: 3.568820
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005939
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008858 | Grad Max: 0.008858
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004982 | Grad Max: 0.997391
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.092267 | Grad Max: 5.525179
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000575 | Grad Max: 0.020396
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049734 | Grad Max: 0.276255
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000712
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009836 | Grad Max: 0.019348
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000479
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002390 | Grad Max: 0.007006
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001425 | Grad Max: 0.003299
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037422 | Grad Max: 0.037422
[GRADIENT NORM TOTAL] 17.1091

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.143
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.508025   0.49197498] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 724/1324 | B: 646/1402 | C: 589/1459
[LOSS Ex1] A: 0.63234 | B: 0.61228 | C: 0.60798
[LOGITS Ex2 A] Mean Abs: 2.237 | Max: 8.135
[LOSS Ex2] A: 0.10480 | B: 0.32722 | C: 0.22659
** [JOINT LOSS] ** : 0.837069
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010824 | Grad Max: 0.303588
  -> Layer: shared_layers.0.bias | Grad Mean: 0.888505 | Grad Max: 3.964028
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.005753
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000069 | Grad Max: 0.000069
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005707 | Grad Max: 0.737754
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.105968 | Grad Max: 4.087673
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000678 | Grad Max: 0.022336
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059058 | Grad Max: 0.299503
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.000833
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011667 | Grad Max: 0.023381
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000553
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002824 | Grad Max: 0.008246
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001632 | Grad Max: 0.003647
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043353 | Grad Max: 0.043353
[GRADIENT NORM TOTAL] 18.7425

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.078
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50751644 0.49248362] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 717/1331 | B: 608/1248 | C: 601/1447
[LOSS Ex1] A: 0.62904 | B: 0.61647 | C: 0.61088
[LOGITS Ex2 A] Mean Abs: 2.261 | Max: 7.683
[LOSS Ex2] A: 0.10786 | B: 0.31246 | C: 0.22586
** [JOINT LOSS] ** : 0.834191
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003976 | Grad Max: 0.157316
  -> Layer: shared_layers.0.bias | Grad Mean: 0.458804 | Grad Max: 2.073266
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.006711
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000579 | Grad Max: 0.000579
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002993 | Grad Max: 0.444279
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055942 | Grad Max: 2.479338
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000353 | Grad Max: 0.011360
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030924 | Grad Max: 0.162107
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000461
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005933 | Grad Max: 0.012217
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000263
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001442 | Grad Max: 0.004022
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000833 | Grad Max: 0.002175
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022508 | Grad Max: 0.022508
[GRADIENT NORM TOTAL] 10.1809

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.111
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50833374 0.4916663 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 717/1331 | B: 656/1392 | C: 580/1468
[LOSS Ex1] A: 0.62801 | B: 0.61611 | C: 0.61178
[LOGITS Ex2 A] Mean Abs: 2.281 | Max: 6.703
[LOSS Ex2] A: 0.12099 | B: 0.31973 | C: 0.21952
** [JOINT LOSS] ** : 0.838712
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008554 | Grad Max: 0.269242
  -> Layer: shared_layers.0.bias | Grad Mean: 0.671972 | Grad Max: 3.141446
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.006321
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002148 | Grad Max: 0.002148
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004412 | Grad Max: 0.819476
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080487 | Grad Max: 4.584902
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000481 | Grad Max: 0.017567
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.041474 | Grad Max: 0.226098
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000652
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008325 | Grad Max: 0.016591
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000354
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002006 | Grad Max: 0.005410
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001100 | Grad Max: 0.002850
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030023 | Grad Max: 0.030023
[GRADIENT NORM TOTAL] 15.1571

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.919
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.501258   0.49874195] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.560 | Std: 0.081
[MASKS] A(Pass/Fail): 688/1360 | B: 644/1404 | C: 561/1487
[LOSS Ex1] A: 0.63687 | B: 0.61661 | C: 0.61584
[LOGITS Ex2 A] Mean Abs: 2.277 | Max: 6.040
[LOSS Ex2] A: 0.10405 | B: 0.34283 | C: 0.25895
** [JOINT LOSS] ** : 0.858384
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011898 | Grad Max: 0.431039
  -> Layer: shared_layers.0.bias | Grad Mean: 1.109747 | Grad Max: 5.819005
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.005277
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006799 | Grad Max: 0.006799
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007114 | Grad Max: 1.320729
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.131764 | Grad Max: 7.347874
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000758 | Grad Max: 0.025064
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.066136 | Grad Max: 0.343326
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001027
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013011 | Grad Max: 0.027267
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000577
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003096 | Grad Max: 0.009182
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001652 | Grad Max: 0.003400
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044887 | Grad Max: 0.044887
[GRADIENT NORM TOTAL] 25.0426

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.825
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54357105 0.45642892] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 688/1360 | B: 647/1401 | C: 574/1474
[LOSS Ex1] A: 0.63736 | B: 0.61217 | C: 0.61002
[LOGITS Ex2 A] Mean Abs: 2.250 | Max: 6.199
[LOSS Ex2] A: 0.11908 | B: 0.30508 | C: 0.22787
** [JOINT LOSS] ** : 0.837191
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009388 | Grad Max: 0.339843
  -> Layer: shared_layers.0.bias | Grad Mean: 0.805733 | Grad Max: 4.273692
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.005494
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005268 | Grad Max: 0.005268
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005150 | Grad Max: 1.011205
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.094589 | Grad Max: 5.631574
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000560 | Grad Max: 0.018300
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048752 | Grad Max: 0.247341
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000704
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009694 | Grad Max: 0.018758
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000458
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002349 | Grad Max: 0.006670
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001251 | Grad Max: 0.003118
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034501 | Grad Max: 0.034501
[GRADIENT NORM TOTAL] 18.0836

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.031
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.79907197 0.200928  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.082
[MASKS] A(Pass/Fail): 750/1298 | B: 608/1248 | C: 583/1465
[LOSS Ex1] A: 0.63093 | B: 0.61636 | C: 0.60907
[LOGITS Ex2 A] Mean Abs: 2.221 | Max: 7.841
[LOSS Ex2] A: 0.11288 | B: 0.30435 | C: 0.21591
** [JOINT LOSS] ** : 0.829831
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004184 | Grad Max: 0.151842
  -> Layer: shared_layers.0.bias | Grad Mean: 0.136723 | Grad Max: 0.529207
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.005647
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000640 | Grad Max: 0.000640
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001030 | Grad Max: 0.215933
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017103 | Grad Max: 1.198506
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.005083
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005272 | Grad Max: 0.047567
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000157
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000743 | Grad Max: 0.003360
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000073
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000144 | Grad Max: 0.000852
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.000983
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001449 | Grad Max: 0.001449
[GRADIENT NORM TOTAL] 3.1318

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.146
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000136  0.49998638] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.081
[MASKS] A(Pass/Fail): 723/1325 | B: 656/1392 | C: 570/1478
[LOSS Ex1] A: 0.63802 | B: 0.61600 | C: 0.61062
[LOGITS Ex2 A] Mean Abs: 2.225 | Max: 6.652
[LOSS Ex2] A: 0.09023 | B: 0.33401 | C: 0.20457
** [JOINT LOSS] ** : 0.831149
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005791 | Grad Max: 0.200564
  -> Layer: shared_layers.0.bias | Grad Mean: 0.517420 | Grad Max: 2.775786
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005836
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001847 | Grad Max: 0.001847
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003287 | Grad Max: 0.688409
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059880 | Grad Max: 3.866540
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.012739
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031187 | Grad Max: 0.164926
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000461
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006295 | Grad Max: 0.012501
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000305
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001521 | Grad Max: 0.004541
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000872 | Grad Max: 0.002626
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023129 | Grad Max: 0.023129
[GRADIENT NORM TOTAL] 11.6360

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.808
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7303075 0.2696925] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.082
[MASKS] A(Pass/Fail): 714/1334 | B: 645/1403 | C: 362/1014
[LOSS Ex1] A: 0.63331 | B: 0.61651 | C: 0.62121
[LOGITS Ex2 A] Mean Abs: 2.228 | Max: 8.244
[LOSS Ex2] A: 0.11683 | B: 0.31819 | C: 0.25745
** [JOINT LOSS] ** : 0.854500
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005799 | Grad Max: 0.175994
  -> Layer: shared_layers.0.bias | Grad Mean: 0.348157 | Grad Max: 1.968011
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.005664
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010299 | Grad Max: 0.010299
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.605284
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038378 | Grad Max: 3.379683
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.007793
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019051 | Grad Max: 0.092470
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000372
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003849 | Grad Max: 0.008297
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000209
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000974 | Grad Max: 0.002800
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000679 | Grad Max: 0.001717
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017521 | Grad Max: 0.017521
[GRADIENT NORM TOTAL] 7.7720

[EPOCH SUMMARY] Train Loss: 0.8390

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8193 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 152/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.934
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63655245 0.3634475 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.082
[MASKS] A(Pass/Fail): 590/1026 | B: 647/1401 | C: 561/1487
[LOSS Ex1] A: 0.63147 | B: 0.61205 | C: 0.61228
[LOGITS Ex2 A] Mean Abs: 2.298 | Max: 13.373
[LOSS Ex2] A: 0.10686 | B: 0.30889 | C: 0.21518
** [JOINT LOSS] ** : 0.828916
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005457 | Grad Max: 0.173669
  -> Layer: shared_layers.0.bias | Grad Mean: 0.495622 | Grad Max: 2.131582
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005513
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004311 | Grad Max: 0.004311
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003357 | Grad Max: 0.406862
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060532 | Grad Max: 2.254767
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000343 | Grad Max: 0.014922
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030052 | Grad Max: 0.197044
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000462
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005504 | Grad Max: 0.011462
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000258
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001321 | Grad Max: 0.003846
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000632 | Grad Max: 0.001957
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019066 | Grad Max: 0.019066
[GRADIENT NORM TOTAL] 10.8484

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.147
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50803024 0.49196982] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 724/1324 | B: 608/1248 | C: 562/1486
[LOSS Ex1] A: 0.63220 | B: 0.61624 | C: 0.61288
[LOGITS Ex2 A] Mean Abs: 2.297 | Max: 9.391
[LOSS Ex2] A: 0.10037 | B: 0.30304 | C: 0.23481
** [JOINT LOSS] ** : 0.833180
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005347 | Grad Max: 0.217880
  -> Layer: shared_layers.0.bias | Grad Mean: 0.441438 | Grad Max: 1.986543
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.005741
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001087 | Grad Max: 0.001087
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003116 | Grad Max: 0.382197
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056290 | Grad Max: 2.031134
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000325 | Grad Max: 0.011591
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028273 | Grad Max: 0.170345
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000445
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005251 | Grad Max: 0.011290
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000236
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001256 | Grad Max: 0.003617
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000621 | Grad Max: 0.001842
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018391 | Grad Max: 0.018391
[GRADIENT NORM TOTAL] 10.0786

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.082
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50752765 0.49247238] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 717/1331 | B: 656/1392 | C: 568/1480
[LOSS Ex1] A: 0.62889 | B: 0.61589 | C: 0.60978
[LOGITS Ex2 A] Mean Abs: 2.278 | Max: 6.451
[LOSS Ex2] A: 0.10789 | B: 0.32146 | C: 0.20706
** [JOINT LOSS] ** : 0.830326
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003653 | Grad Max: 0.143095
  -> Layer: shared_layers.0.bias | Grad Mean: 0.146542 | Grad Max: 0.830313
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.006506
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006131 | Grad Max: 0.006131
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000851 | Grad Max: 0.583100
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014159 | Grad Max: 3.224070
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002082
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002550 | Grad Max: 0.024211
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000196
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000578 | Grad Max: 0.002755
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000155 | Grad Max: 0.000922
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000307 | Grad Max: 0.001198
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002844 | Grad Max: 0.002844
[GRADIENT NORM TOTAL] 4.6242

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.115
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50838417 0.4916158 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 717/1331 | B: 645/1403 | C: 588/1460
[LOSS Ex1] A: 0.62786 | B: 0.61639 | C: 0.60812
[LOGITS Ex2 A] Mean Abs: 2.221 | Max: 8.410
[LOSS Ex2] A: 0.11564 | B: 0.32180 | C: 0.21617
** [JOINT LOSS] ** : 0.835326
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003996 | Grad Max: 0.124445
  -> Layer: shared_layers.0.bias | Grad Mean: 0.316463 | Grad Max: 1.477840
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.006334
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001828 | Grad Max: 0.001828
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001997 | Grad Max: 0.242869
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036701 | Grad Max: 1.353868
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.007237
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017498 | Grad Max: 0.091919
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000330
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003462 | Grad Max: 0.007748
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000233
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000876 | Grad Max: 0.003306
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000539 | Grad Max: 0.002171
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014585 | Grad Max: 0.014585
[GRADIENT NORM TOTAL] 6.7530

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.923
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012295  0.49877048] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 688/1360 | B: 647/1401 | C: 589/1459
[LOSS Ex1] A: 0.63673 | B: 0.61194 | C: 0.60922
[LOGITS Ex2 A] Mean Abs: 2.181 | Max: 5.920
[LOSS Ex2] A: 0.10234 | B: 0.29548 | C: 0.22460
** [JOINT LOSS] ** : 0.826765
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002653 | Grad Max: 0.064343
  -> Layer: shared_layers.0.bias | Grad Mean: 0.145974 | Grad Max: 0.706828
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.006234
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009743 | Grad Max: 0.009743
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001080 | Grad Max: 0.183179
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019391 | Grad Max: 1.017213
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.004529
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009528 | Grad Max: 0.053411
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000192
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001776 | Grad Max: 0.004876
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000142
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000432 | Grad Max: 0.001745
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000421 | Grad Max: 0.001797
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007732 | Grad Max: 0.007732
[GRADIENT NORM TOTAL] 3.6728

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.828
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5435763 0.4564237] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 688/1360 | B: 608/1248 | C: 593/1455
[LOSS Ex1] A: 0.63722 | B: 0.61612 | C: 0.60387
[LOGITS Ex2 A] Mean Abs: 2.234 | Max: 5.961
[LOSS Ex2] A: 0.11484 | B: 0.30163 | C: 0.21732
** [JOINT LOSS] ** : 0.830330
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007294 | Grad Max: 0.247386
  -> Layer: shared_layers.0.bias | Grad Mean: 0.579786 | Grad Max: 3.179410
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005739
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007377 | Grad Max: 0.007377
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003657 | Grad Max: 0.723002
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067811 | Grad Max: 4.029886
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000397 | Grad Max: 0.012938
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034291 | Grad Max: 0.181128
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000511
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006822 | Grad Max: 0.013885
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000335
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001685 | Grad Max: 0.004445
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000929 | Grad Max: 0.002513
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025458 | Grad Max: 0.025458
[GRADIENT NORM TOTAL] 13.0304

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.035
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7998073  0.20019267] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.082
[MASKS] A(Pass/Fail): 750/1298 | B: 656/1392 | C: 584/1464
[LOSS Ex1] A: 0.63077 | B: 0.61577 | C: 0.61039
[LOGITS Ex2 A] Mean Abs: 2.259 | Max: 7.347
[LOSS Ex2] A: 0.10885 | B: 0.32002 | C: 0.23630
** [JOINT LOSS] ** : 0.840702
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010097 | Grad Max: 0.257659
  -> Layer: shared_layers.0.bias | Grad Mean: 0.666161 | Grad Max: 3.093738
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005990
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000597 | Grad Max: 0.000597
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004334 | Grad Max: 0.822711
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080152 | Grad Max: 4.596558
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000505 | Grad Max: 0.015973
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.043609 | Grad Max: 0.215492
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000674
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008688 | Grad Max: 0.017676
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000415
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002116 | Grad Max: 0.006195
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001151 | Grad Max: 0.002819
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031240 | Grad Max: 0.031240
[GRADIENT NORM TOTAL] 14.7961

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.150
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50003743 0.49996254] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 723/1325 | B: 645/1403 | C: 596/1452
[LOSS Ex1] A: 0.63786 | B: 0.61627 | C: 0.60775
[LOGITS Ex2 A] Mean Abs: 2.271 | Max: 6.306
[LOSS Ex2] A: 0.09850 | B: 0.30807 | C: 0.21832
** [JOINT LOSS] ** : 0.828924
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004477 | Grad Max: 0.107407
  -> Layer: shared_layers.0.bias | Grad Mean: 0.195126 | Grad Max: 0.891397
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.005417
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002383 | Grad Max: 0.002383
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001441 | Grad Max: 0.405407
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026162 | Grad Max: 2.251768
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.004571
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011842 | Grad Max: 0.060067
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000233
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002484 | Grad Max: 0.006139
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000155
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000646 | Grad Max: 0.002170
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000417 | Grad Max: 0.001322
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009600 | Grad Max: 0.009600
[GRADIENT NORM TOTAL] 5.1166

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.811
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73086554 0.26913443] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.082
[MASKS] A(Pass/Fail): 714/1334 | B: 648/1400 | C: 570/1478
[LOSS Ex1] A: 0.63314 | B: 0.61181 | C: 0.61333
[LOGITS Ex2 A] Mean Abs: 2.181 | Max: 6.727
[LOSS Ex2] A: 0.11901 | B: 0.31572 | C: 0.21062
** [JOINT LOSS] ** : 0.834546
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007148 | Grad Max: 0.256460
  -> Layer: shared_layers.0.bias | Grad Mean: 0.733536 | Grad Max: 3.509427
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006053
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009260 | Grad Max: 0.009260
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004730 | Grad Max: 0.781187
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088095 | Grad Max: 4.386971
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000546 | Grad Max: 0.018476
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048221 | Grad Max: 0.262483
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000751
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009336 | Grad Max: 0.020172
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000452
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002251 | Grad Max: 0.007085
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001209 | Grad Max: 0.003141
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033623 | Grad Max: 0.033623
[GRADIENT NORM TOTAL] 16.1767

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.938
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6367706  0.36322936] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 590/1026 | B: 608/1248 | C: 574/1474
[LOSS Ex1] A: 0.63129 | B: 0.61600 | C: 0.61383
[LOGITS Ex2 A] Mean Abs: 2.209 | Max: 9.394
[LOSS Ex2] A: 0.11230 | B: 0.34648 | C: 0.24307
** [JOINT LOSS] ** : 0.854322
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008503 | Grad Max: 0.340423
  -> Layer: shared_layers.0.bias | Grad Mean: 0.978707 | Grad Max: 4.525155
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.006051
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003400 | Grad Max: 0.003400
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006172 | Grad Max: 0.845343
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.115138 | Grad Max: 4.747772
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000735 | Grad Max: 0.024511
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065453 | Grad Max: 0.331815
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.000908
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012840 | Grad Max: 0.025138
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000584
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003145 | Grad Max: 0.008764
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001673 | Grad Max: 0.003620
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047505 | Grad Max: 0.047505
[GRADIENT NORM TOTAL] 21.0842

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.151
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50807613 0.49192384] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 724/1324 | B: 656/1392 | C: 567/1481
[LOSS Ex1] A: 0.63202 | B: 0.61566 | C: 0.61403
[LOGITS Ex2 A] Mean Abs: 2.230 | Max: 8.092
[LOSS Ex2] A: 0.10005 | B: 0.34093 | C: 0.23409
** [JOINT LOSS] ** : 0.845595
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004763 | Grad Max: 0.235202
  -> Layer: shared_layers.0.bias | Grad Mean: 0.552336 | Grad Max: 3.177644
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005950
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001820 | Grad Max: 0.001820
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003490 | Grad Max: 0.659501
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064662 | Grad Max: 3.711680
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000385 | Grad Max: 0.013048
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034443 | Grad Max: 0.176715
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000541
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006703 | Grad Max: 0.014614
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000312
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001647 | Grad Max: 0.005036
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000894 | Grad Max: 0.002505
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025207 | Grad Max: 0.025207
[GRADIENT NORM TOTAL] 13.0881

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50740045 0.49259952] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 717/1331 | B: 645/1403 | C: 569/1479
[LOSS Ex1] A: 0.62871 | B: 0.61616 | C: 0.61103
[LOGITS Ex2 A] Mean Abs: 2.261 | Max: 6.750
[LOSS Ex2] A: 0.11140 | B: 0.31472 | C: 0.21630
** [JOINT LOSS] ** : 0.832776
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004154 | Grad Max: 0.146170
  -> Layer: shared_layers.0.bias | Grad Mean: 0.352488 | Grad Max: 1.840593
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006062
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000933 | Grad Max: 0.000933
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.508351
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045527 | Grad Max: 2.846020
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.007116
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021430 | Grad Max: 0.101610
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000368
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004163 | Grad Max: 0.009193
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001008 | Grad Max: 0.003011
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000524 | Grad Max: 0.001864
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014692 | Grad Max: 0.014692
[GRADIENT NORM TOTAL] 8.4936

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.119
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50849265 0.49150735] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 717/1331 | B: 648/1400 | C: 570/1478
[LOSS Ex1] A: 0.62768 | B: 0.61170 | C: 0.61551
[LOGITS Ex2 A] Mean Abs: 2.255 | Max: 7.774
[LOSS Ex2] A: 0.11817 | B: 0.31445 | C: 0.23903
** [JOINT LOSS] ** : 0.842181
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007535 | Grad Max: 0.259646
  -> Layer: shared_layers.0.bias | Grad Mean: 0.726010 | Grad Max: 3.326264
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.006212
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004580 | Grad Max: 0.004580
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004844 | Grad Max: 0.964914
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.089828 | Grad Max: 5.396534
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000553 | Grad Max: 0.015983
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048983 | Grad Max: 0.236670
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000708
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009651 | Grad Max: 0.019401
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000408
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002325 | Grad Max: 0.006333
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001195 | Grad Max: 0.002776
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033823 | Grad Max: 0.033823
[GRADIENT NORM TOTAL] 16.3822

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.926
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50124663 0.4987534 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 688/1360 | B: 608/1248 | C: 430/946
[LOSS Ex1] A: 0.63655 | B: 0.61590 | C: 0.60226
[LOGITS Ex2 A] Mean Abs: 2.215 | Max: 6.590
[LOSS Ex2] A: 0.10543 | B: 0.30119 | C: 0.20923
** [JOINT LOSS] ** : 0.823522
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002768 | Grad Max: 0.111836
  -> Layer: shared_layers.0.bias | Grad Mean: 0.246830 | Grad Max: 1.527001
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.005638
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004570 | Grad Max: 0.004570
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001650 | Grad Max: 0.264622
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029936 | Grad Max: 1.469090
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.007582
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012700 | Grad Max: 0.084712
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000233
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002396 | Grad Max: 0.005929
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000140
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000604 | Grad Max: 0.001914
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001398
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008744 | Grad Max: 0.008744
[GRADIENT NORM TOTAL] 5.8505

[EPOCH SUMMARY] Train Loss: 0.8348

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8191 | Alpha: 0.5500
No improve count: 3/15

############################## EPOCH 153/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.831
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54353017 0.45646977] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.559 | Std: 0.078
[MASKS] A(Pass/Fail): 688/1360 | B: 656/1392 | C: 599/1449
[LOSS Ex1] A: 0.63706 | B: 0.61556 | C: 0.60683
[LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.852
[LOSS Ex2] A: 0.10949 | B: 0.32987 | C: 0.20835
** [JOINT LOSS] ** : 0.835717
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006632 | Grad Max: 0.171242
  -> Layer: shared_layers.0.bias | Grad Mean: 0.511304 | Grad Max: 2.288469
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005759
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008658 | Grad Max: 0.008658
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003512 | Grad Max: 0.868056
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064455 | Grad Max: 4.803056
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000376 | Grad Max: 0.012559
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032782 | Grad Max: 0.163594
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000517
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006561 | Grad Max: 0.014179
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000361
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001585 | Grad Max: 0.005367
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000889 | Grad Max: 0.002670
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024111 | Grad Max: 0.024111
[GRADIENT NORM TOTAL] 12.1187

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.039
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8005527 0.1994473] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 750/1298 | B: 645/1403 | C: 569/1479
[LOSS Ex1] A: 0.63061 | B: 0.61605 | C: 0.61226
[LOGITS Ex2 A] Mean Abs: 2.187 | Max: 6.090
[LOSS Ex2] A: 0.10188 | B: 0.32635 | C: 0.22050
** [JOINT LOSS] ** : 0.835888
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004257 | Grad Max: 0.222570
  -> Layer: shared_layers.0.bias | Grad Mean: 0.575908 | Grad Max: 2.820241
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.006238
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000955 | Grad Max: 0.000955
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003712 | Grad Max: 0.847083
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069397 | Grad Max: 4.689932
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000418 | Grad Max: 0.013599
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037634 | Grad Max: 0.189873
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000595
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007313 | Grad Max: 0.015528
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000386
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001774 | Grad Max: 0.005724
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000958 | Grad Max: 0.002754
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026793 | Grad Max: 0.026793
[GRADIENT NORM TOTAL] 13.2141

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.154
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000642  0.49993578] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.081
[MASKS] A(Pass/Fail): 723/1325 | B: 648/1400 | C: 598/1450
[LOSS Ex1] A: 0.63772 | B: 0.61160 | C: 0.61127
[LOGITS Ex2 A] Mean Abs: 2.211 | Max: 6.384
[LOSS Ex2] A: 0.09957 | B: 0.29309 | C: 0.22728
** [JOINT LOSS] ** : 0.826838
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002842 | Grad Max: 0.080170
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141178 | Grad Max: 0.585776
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005148
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001837 | Grad Max: 0.001837
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001054 | Grad Max: 0.615192
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018772 | Grad Max: 3.414342
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004728 | Grad Max: 0.028755
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000184
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000954 | Grad Max: 0.003609
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000221 | Grad Max: 0.001181
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000406 | Grad Max: 0.001592
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004183 | Grad Max: 0.004183
[GRADIENT NORM TOTAL] 5.2314

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.813
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7313856  0.26861444] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.082
[MASKS] A(Pass/Fail): 713/1335 | B: 608/1248 | C: 579/1469
[LOSS Ex1] A: 0.63299 | B: 0.61579 | C: 0.61195
[LOGITS Ex2 A] Mean Abs: 2.212 | Max: 6.461
[LOSS Ex2] A: 0.11579 | B: 0.30684 | C: 0.23140
** [JOINT LOSS] ** : 0.838258
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004304 | Grad Max: 0.189417
  -> Layer: shared_layers.0.bias | Grad Mean: 0.456047 | Grad Max: 2.481765
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005495
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004815 | Grad Max: 0.004815
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003028 | Grad Max: 0.581367
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055752 | Grad Max: 3.257293
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000315 | Grad Max: 0.012178
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027978 | Grad Max: 0.145476
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000393
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005502 | Grad Max: 0.011682
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000307
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001343 | Grad Max: 0.004056
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000713 | Grad Max: 0.002214
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020502 | Grad Max: 0.020502
[GRADIENT NORM TOTAL] 10.6662

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.940
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63697577 0.36302426] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 590/1026 | B: 656/1392 | C: 580/1468
[LOSS Ex1] A: 0.63114 | B: 0.61546 | C: 0.61003
[LOGITS Ex2 A] Mean Abs: 2.276 | Max: 9.944
[LOSS Ex2] A: 0.11038 | B: 0.31609 | C: 0.22633
** [JOINT LOSS] ** : 0.836478
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004582 | Grad Max: 0.179477
  -> Layer: shared_layers.0.bias | Grad Mean: 0.481807 | Grad Max: 2.393337
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006155
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004277 | Grad Max: 0.004277
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003022 | Grad Max: 0.595395
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056510 | Grad Max: 3.322370
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000337 | Grad Max: 0.011269
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029815 | Grad Max: 0.142092
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000461
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005741 | Grad Max: 0.012139
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000283
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001374 | Grad Max: 0.004168
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000714 | Grad Max: 0.002043
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020125 | Grad Max: 0.020125
[GRADIENT NORM TOTAL] 10.7058

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.155
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080868  0.49191314] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 724/1324 | B: 644/1404 | C: 592/1456
[LOSS Ex1] A: 0.63187 | B: 0.61595 | C: 0.60509
[LOGITS Ex2 A] Mean Abs: 2.209 | Max: 8.015
[LOSS Ex2] A: 0.10016 | B: 0.31746 | C: 0.20784
** [JOINT LOSS] ** : 0.826127
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002870 | Grad Max: 0.078955
  -> Layer: shared_layers.0.bias | Grad Mean: 0.098221 | Grad Max: 0.683431
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.005708
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000468 | Grad Max: 0.000468
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000864 | Grad Max: 0.171044
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015247 | Grad Max: 0.926127
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.005291
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005803 | Grad Max: 0.079131
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000163
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001028 | Grad Max: 0.003771
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000111
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000247 | Grad Max: 0.001018
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001455
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004226 | Grad Max: 0.004226
[GRADIENT NORM TOTAL] 2.8365

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.090
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073296  0.49267048] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 717/1331 | B: 648/1400 | C: 568/1480
[LOSS Ex1] A: 0.62856 | B: 0.61149 | C: 0.61208
[LOGITS Ex2 A] Mean Abs: 2.220 | Max: 7.130
[LOSS Ex2] A: 0.10179 | B: 0.30152 | C: 0.20733
** [JOINT LOSS] ** : 0.820923
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002528 | Grad Max: 0.061896
  -> Layer: shared_layers.0.bias | Grad Mean: 0.191037 | Grad Max: 0.890903
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.005742
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000374 | Grad Max: 0.000374
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001346 | Grad Max: 0.374611
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024010 | Grad Max: 2.090009
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.005000
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009744 | Grad Max: 0.063004
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000216
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001862 | Grad Max: 0.005601
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000119
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000444 | Grad Max: 0.001684
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000390 | Grad Max: 0.001440
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006716 | Grad Max: 0.006716
[GRADIENT NORM TOTAL] 4.9425

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.123
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085858 0.4914142] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.082
[MASKS] A(Pass/Fail): 718/1330 | B: 609/1247 | C: 580/1468
[LOSS Ex1] A: 0.62752 | B: 0.61568 | C: 0.61075
[LOGITS Ex2 A] Mean Abs: 2.209 | Max: 8.798
[LOSS Ex2] A: 0.11303 | B: 0.29763 | C: 0.22910
** [JOINT LOSS] ** : 0.831240
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003589 | Grad Max: 0.131314
  -> Layer: shared_layers.0.bias | Grad Mean: 0.142613 | Grad Max: 0.521141
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006236
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005399 | Grad Max: 0.005399
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001327 | Grad Max: 0.317898
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023490 | Grad Max: 1.775632
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000129 | Grad Max: 0.004779
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010693 | Grad Max: 0.062502
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002230 | Grad Max: 0.005516
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000136
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000537 | Grad Max: 0.001974
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000365 | Grad Max: 0.001342
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007173 | Grad Max: 0.007173
[GRADIENT NORM TOTAL] 4.1686

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.930
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50124466 0.4987553 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 688/1360 | B: 658/1390 | C: 564/1484
[LOSS Ex1] A: 0.63639 | B: 0.61534 | C: 0.61437
[LOGITS Ex2 A] Mean Abs: 2.179 | Max: 5.795
[LOSS Ex2] A: 0.10330 | B: 0.32384 | C: 0.22196
** [JOINT LOSS] ** : 0.838400
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003217 | Grad Max: 0.105462
  -> Layer: shared_layers.0.bias | Grad Mean: 0.190566 | Grad Max: 1.194301
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.005273
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004954 | Grad Max: 0.004954
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001314 | Grad Max: 0.544803
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023628 | Grad Max: 3.063382
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.005527
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008775 | Grad Max: 0.063388
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000281
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001866 | Grad Max: 0.006342
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000108
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000496 | Grad Max: 0.001550
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000350 | Grad Max: 0.001523
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008565 | Grad Max: 0.008565
[GRADIENT NORM TOTAL] 5.7033

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.834
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5434653 0.4565347] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.078
[MASKS] A(Pass/Fail): 688/1360 | B: 644/1404 | C: 531/1517
[LOSS Ex1] A: 0.63690 | B: 0.61582 | C: 0.61645
[LOGITS Ex2 A] Mean Abs: 2.206 | Max: 6.629
[LOSS Ex2] A: 0.10932 | B: 0.31452 | C: 0.22106
** [JOINT LOSS] ** : 0.838024
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004125 | Grad Max: 0.153458
  -> Layer: shared_layers.0.bias | Grad Mean: 0.138999 | Grad Max: 0.768375
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.005536
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007907 | Grad Max: 0.007907
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001015 | Grad Max: 0.586894
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016323 | Grad Max: 3.263661
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.002760
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002397 | Grad Max: 0.018473
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000380 | Grad Max: 0.002194
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000095 | Grad Max: 0.000781
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000374 | Grad Max: 0.001188
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001684 | Grad Max: 0.001684
[GRADIENT NORM TOTAL] 4.9136

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.043
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.80136406 0.19863589] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 750/1298 | B: 649/1399 | C: 575/1473
[LOSS Ex1] A: 0.63044 | B: 0.61135 | C: 0.60607
[LOGITS Ex2 A] Mean Abs: 2.226 | Max: 7.261
[LOSS Ex2] A: 0.10038 | B: 0.29640 | C: 0.19476
** [JOINT LOSS] ** : 0.813134
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003389 | Grad Max: 0.119697
  -> Layer: shared_layers.0.bias | Grad Mean: 0.243384 | Grad Max: 1.336020
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.005903
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001879 | Grad Max: 0.001879
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001498 | Grad Max: 0.316151
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027422 | Grad Max: 1.755834
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.006507
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010783 | Grad Max: 0.058504
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000222
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002071 | Grad Max: 0.005862
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000135
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000541 | Grad Max: 0.001846
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000558 | Grad Max: 0.001689
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008467 | Grad Max: 0.008467
[GRADIENT NORM TOTAL] 5.6746

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.158
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50010705 0.49989295] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 723/1325 | B: 609/1247 | C: 612/1436
[LOSS Ex1] A: 0.63754 | B: 0.61554 | C: 0.61087
[LOGITS Ex2 A] Mean Abs: 2.235 | Max: 5.301
[LOSS Ex2] A: 0.09857 | B: 0.30594 | C: 0.21067
** [JOINT LOSS] ** : 0.826378
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003211 | Grad Max: 0.141695
  -> Layer: shared_layers.0.bias | Grad Mean: 0.158788 | Grad Max: 0.656063
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005467
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002540 | Grad Max: 0.002540
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001200 | Grad Max: 0.223541
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020388 | Grad Max: 1.211584
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000120 | Grad Max: 0.004502
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010153 | Grad Max: 0.056926
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000225
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002215 | Grad Max: 0.005648
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000137
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000544 | Grad Max: 0.001737
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001582
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008367 | Grad Max: 0.008367
[GRADIENT NORM TOTAL] 3.9360

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.817
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7320677 0.2679323] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.083
[MASKS] A(Pass/Fail): 713/1335 | B: 658/1390 | C: 578/1470
[LOSS Ex1] A: 0.63280 | B: 0.61520 | C: 0.60724
[LOGITS Ex2 A] Mean Abs: 2.235 | Max: 7.355
[LOSS Ex2] A: 0.11019 | B: 0.31713 | C: 0.23515
** [JOINT LOSS] ** : 0.839239
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002400 | Grad Max: 0.044911
  -> Layer: shared_layers.0.bias | Grad Mean: 0.135205 | Grad Max: 0.633643
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.005690
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007344 | Grad Max: 0.007344
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000914 | Grad Max: 0.205855
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015678 | Grad Max: 1.158123
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.002977
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002946 | Grad Max: 0.027623
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000162
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000529 | Grad Max: 0.002917
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000117 | Grad Max: 0.000666
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000277 | Grad Max: 0.000809
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000212 | Grad Max: 0.000212
[GRADIENT NORM TOTAL] 3.5015

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.944
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63726795 0.362732  ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 590/1026 | B: 644/1404 | C: 412/964
[LOSS Ex1] A: 0.63094 | B: 0.61567 | C: 0.60456
[LOGITS Ex2 A] Mean Abs: 2.283 | Max: 8.866
[LOSS Ex2] A: 0.11252 | B: 0.31189 | C: 0.24651
** [JOINT LOSS] ** : 0.840693
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003181 | Grad Max: 0.079816
  -> Layer: shared_layers.0.bias | Grad Mean: 0.158554 | Grad Max: 0.697370
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006488
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001968 | Grad Max: 0.001968
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001256 | Grad Max: 0.189290
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021664 | Grad Max: 1.059678
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.004629
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008042 | Grad Max: 0.046414
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000238
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001721 | Grad Max: 0.005154
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000163
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000365 | Grad Max: 0.001774
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000219 | Grad Max: 0.000967
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003701 | Grad Max: 0.003701
[GRADIENT NORM TOTAL] 3.8261

[EPOCH SUMMARY] Train Loss: 0.8320

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8149 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8174 -> New: 0.8149)

############################## EPOCH 154/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.160
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50810784 0.49189216] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 723/1325 | B: 649/1399 | C: 610/1438
[LOSS Ex1] A: 0.63166 | B: 0.61119 | C: 0.60608
[LOGITS Ex2 A] Mean Abs: 2.233 | Max: 8.142
[LOSS Ex2] A: 0.09318 | B: 0.29514 | C: 0.23818
** [JOINT LOSS] ** : 0.825142
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004907 | Grad Max: 0.179034
  -> Layer: shared_layers.0.bias | Grad Mean: 0.307258 | Grad Max: 1.873354
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.005643
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006280 | Grad Max: 0.006280
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001960 | Grad Max: 0.517433
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034083 | Grad Max: 2.814525
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.007787
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014477 | Grad Max: 0.114202
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000238
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002428 | Grad Max: 0.005893
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000130
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000605 | Grad Max: 0.001880
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001656
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010265 | Grad Max: 0.010265
[GRADIENT NORM TOTAL] 7.3468

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.095
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50721604 0.49278393] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 717/1331 | B: 610/1246 | C: 579/1469
[LOSS Ex1] A: 0.62834 | B: 0.61536 | C: 0.61028
[LOGITS Ex2 A] Mean Abs: 2.244 | Max: 5.819
[LOSS Ex2] A: 0.09987 | B: 0.30097 | C: 0.21229
** [JOINT LOSS] ** : 0.822372
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004136 | Grad Max: 0.130517
  -> Layer: shared_layers.0.bias | Grad Mean: 0.319797 | Grad Max: 1.728895
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.006532
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008431 | Grad Max: 0.008431
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002004 | Grad Max: 0.479807
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035531 | Grad Max: 2.643390
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000172 | Grad Max: 0.007256
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014995 | Grad Max: 0.095342
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000270
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002652 | Grad Max: 0.006156
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000150
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000650 | Grad Max: 0.002421
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000353 | Grad Max: 0.001693
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009251 | Grad Max: 0.009251
[GRADIENT NORM TOTAL] 7.3641

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.128
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50867635 0.49132368] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 717/1331 | B: 658/1390 | C: 611/1437
[LOSS Ex1] A: 0.62730 | B: 0.61502 | C: 0.60581
[LOGITS Ex2 A] Mean Abs: 2.235 | Max: 7.679
[LOSS Ex2] A: 0.12457 | B: 0.31843 | C: 0.22714
** [JOINT LOSS] ** : 0.839425
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006673 | Grad Max: 0.240954
  -> Layer: shared_layers.0.bias | Grad Mean: 0.300468 | Grad Max: 1.259275
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006034
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000119 | Grad Max: 0.000119
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.469604
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038965 | Grad Max: 2.610384
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.008933
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020853 | Grad Max: 0.116698
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000408
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004363 | Grad Max: 0.009394
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001065 | Grad Max: 0.003459
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000573 | Grad Max: 0.001656
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015278 | Grad Max: 0.015278
[GRADIENT NORM TOTAL] 6.9682

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.934
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50122535 0.4987746 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.082
[MASKS] A(Pass/Fail): 688/1360 | B: 645/1403 | C: 613/1435
[LOSS Ex1] A: 0.63618 | B: 0.61549 | C: 0.60391
[LOGITS Ex2 A] Mean Abs: 2.188 | Max: 7.339
[LOSS Ex2] A: 0.10779 | B: 0.31177 | C: 0.22155
** [JOINT LOSS] ** : 0.832227
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002443 | Grad Max: 0.094536
  -> Layer: shared_layers.0.bias | Grad Mean: 0.171083 | Grad Max: 1.109347
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.005823
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002673 | Grad Max: 0.002673
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001268 | Grad Max: 0.329060
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022721 | Grad Max: 1.809894
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.004336
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007314 | Grad Max: 0.052977
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000193
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001365 | Grad Max: 0.004297
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000104
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000342 | Grad Max: 0.001285
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001189
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004928 | Grad Max: 0.004928
[GRADIENT NORM TOTAL] 4.7872

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.837
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5433875 0.4566126] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 688/1360 | B: 648/1400 | C: 570/1478
[LOSS Ex1] A: 0.63669 | B: 0.61101 | C: 0.61257
[LOGITS Ex2 A] Mean Abs: 2.162 | Max: 6.213
[LOSS Ex2] A: 0.11639 | B: 0.29776 | C: 0.22956
** [JOINT LOSS] ** : 0.834663
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009232 | Grad Max: 0.239306
  -> Layer: shared_layers.0.bias | Grad Mean: 0.440128 | Grad Max: 1.798277
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.005403
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006408 | Grad Max: 0.006408
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003014 | Grad Max: 0.420667
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054643 | Grad Max: 2.241214
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000334 | Grad Max: 0.009176
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028420 | Grad Max: 0.127465
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000553
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005877 | Grad Max: 0.011899
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000286
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001413 | Grad Max: 0.004022
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000841 | Grad Max: 0.002615
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021207 | Grad Max: 0.021207
[GRADIENT NORM TOTAL] 9.2281

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.047
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.802384 0.197616] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 751/1297 | B: 610/1246 | C: 577/1471
[LOSS Ex1] A: 0.63022 | B: 0.61518 | C: 0.61323
[LOGITS Ex2 A] Mean Abs: 2.233 | Max: 6.377
[LOSS Ex2] A: 0.10459 | B: 0.30057 | C: 0.24287
** [JOINT LOSS] ** : 0.835557
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005432 | Grad Max: 0.145806
  -> Layer: shared_layers.0.bias | Grad Mean: 0.228222 | Grad Max: 1.186232
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.005796
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000326 | Grad Max: 0.000326
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001680 | Grad Max: 0.450719
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030275 | Grad Max: 2.542480
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.006077
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014125 | Grad Max: 0.072628
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000332
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002929 | Grad Max: 0.007470
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000196
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000679 | Grad Max: 0.002267
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000424 | Grad Max: 0.001719
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010052 | Grad Max: 0.010052
[GRADIENT NORM TOTAL] 5.7096

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.164
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500144 0.499856] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 723/1325 | B: 658/1390 | C: 551/1497
[LOSS Ex1] A: 0.63734 | B: 0.61486 | C: 0.61804
[LOGITS Ex2 A] Mean Abs: 2.273 | Max: 6.874
[LOSS Ex2] A: 0.09932 | B: 0.32649 | C: 0.22164
** [JOINT LOSS] ** : 0.839230
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005339 | Grad Max: 0.211509
  -> Layer: shared_layers.0.bias | Grad Mean: 0.542421 | Grad Max: 2.687800
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.005227
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002820 | Grad Max: 0.002820
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003384 | Grad Max: 0.488036
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061306 | Grad Max: 2.701062
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000358 | Grad Max: 0.013952
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031693 | Grad Max: 0.181958
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000451
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005811 | Grad Max: 0.012057
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000257
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001437 | Grad Max: 0.004092
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000671 | Grad Max: 0.002125
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020726 | Grad Max: 0.020726
[GRADIENT NORM TOTAL] 11.6337

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.821
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.732795   0.26720503] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.083
[MASKS] A(Pass/Fail): 713/1335 | B: 645/1403 | C: 574/1474
[LOSS Ex1] A: 0.63258 | B: 0.61533 | C: 0.61097
[LOGITS Ex2 A] Mean Abs: 2.254 | Max: 6.764
[LOSS Ex2] A: 0.11830 | B: 0.32298 | C: 0.22988
** [JOINT LOSS] ** : 0.843347
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004078 | Grad Max: 0.188459
  -> Layer: shared_layers.0.bias | Grad Mean: 0.588772 | Grad Max: 2.593198
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005783
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004448 | Grad Max: 0.004448
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003748 | Grad Max: 0.522929
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069835 | Grad Max: 2.850616
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000419 | Grad Max: 0.014427
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037669 | Grad Max: 0.211439
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000573
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007159 | Grad Max: 0.015290
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000360
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001724 | Grad Max: 0.005515
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000870 | Grad Max: 0.002282
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024827 | Grad Max: 0.024827
[GRADIENT NORM TOTAL] 12.9974

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.949
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6376168  0.36238316] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 590/1026 | B: 648/1400 | C: 573/1475
[LOSS Ex1] A: 0.63073 | B: 0.61085 | C: 0.60540
[LOGITS Ex2 A] Mean Abs: 2.279 | Max: 9.904
[LOSS Ex2] A: 0.10638 | B: 0.29838 | C: 0.17797
** [JOINT LOSS] ** : 0.809905
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002903 | Grad Max: 0.081961
  -> Layer: shared_layers.0.bias | Grad Mean: 0.177503 | Grad Max: 0.815975
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.005863
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006974 | Grad Max: 0.006974
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001173 | Grad Max: 0.587875
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020416 | Grad Max: 3.241303
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.004467
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005761 | Grad Max: 0.045195
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000150
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000984 | Grad Max: 0.003775
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000294 | Grad Max: 0.001346
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000533 | Grad Max: 0.001369
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004814 | Grad Max: 0.004814
[GRADIENT NORM TOTAL] 5.5212

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.165
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081106  0.49188942] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 610/1246 | C: 556/1492
[LOSS Ex1] A: 0.63146 | B: 0.61502 | C: 0.61488
[LOGITS Ex2 A] Mean Abs: 2.241 | Max: 7.543
[LOSS Ex2] A: 0.09698 | B: 0.31807 | C: 0.22791
** [JOINT LOSS] ** : 0.834770
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004962 | Grad Max: 0.192225
  -> Layer: shared_layers.0.bias | Grad Mean: 0.539744 | Grad Max: 2.630413
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005871
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002233 | Grad Max: 0.002233
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003604 | Grad Max: 0.581572
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067173 | Grad Max: 3.268096
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000412 | Grad Max: 0.016072
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036940 | Grad Max: 0.212451
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000589
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007100 | Grad Max: 0.014675
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000363
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001711 | Grad Max: 0.005453
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000895 | Grad Max: 0.002338
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024875 | Grad Max: 0.024875
[GRADIENT NORM TOTAL] 12.3253

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.099
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071141  0.49288583] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 717/1331 | B: 658/1390 | C: 579/1469
[LOSS Ex1] A: 0.62815 | B: 0.61471 | C: 0.60569
[LOGITS Ex2 A] Mean Abs: 2.223 | Max: 6.229
[LOSS Ex2] A: 0.10157 | B: 0.33488 | C: 0.21793
** [JOINT LOSS] ** : 0.834309
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003834 | Grad Max: 0.231708
  -> Layer: shared_layers.0.bias | Grad Mean: 0.574105 | Grad Max: 3.107951
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.006314
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002247 | Grad Max: 0.002247
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003513 | Grad Max: 0.699370
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065378 | Grad Max: 3.891796
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.013023
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033134 | Grad Max: 0.180735
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000466
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006316 | Grad Max: 0.012958
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000298
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001545 | Grad Max: 0.004501
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000882 | Grad Max: 0.002227
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023916 | Grad Max: 0.023916
[GRADIENT NORM TOTAL] 13.0849

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.132
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50874317 0.49125683] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 717/1331 | B: 645/1403 | C: 575/1473
[LOSS Ex1] A: 0.62711 | B: 0.61519 | C: 0.60731
[LOGITS Ex2 A] Mean Abs: 2.241 | Max: 8.080
[LOSS Ex2] A: 0.11990 | B: 0.31718 | C: 0.21825
** [JOINT LOSS] ** : 0.834980
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005817 | Grad Max: 0.249926
  -> Layer: shared_layers.0.bias | Grad Mean: 0.142376 | Grad Max: 0.598485
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.005986
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001871 | Grad Max: 0.001871
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001299 | Grad Max: 0.299434
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021087 | Grad Max: 1.663070
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.003700
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004021 | Grad Max: 0.028785
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000251
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000959 | Grad Max: 0.003900
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000095
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000227 | Grad Max: 0.000965
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000293 | Grad Max: 0.000935
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002118 | Grad Max: 0.002118
[GRADIENT NORM TOTAL] 4.1260

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.939
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50118804 0.4988119 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.082
[MASKS] A(Pass/Fail): 688/1360 | B: 649/1399 | C: 572/1476
[LOSS Ex1] A: 0.63600 | B: 0.61071 | C: 0.61187
[LOGITS Ex2 A] Mean Abs: 2.230 | Max: 6.056
[LOSS Ex2] A: 0.10215 | B: 0.30055 | C: 0.23359
** [JOINT LOSS] ** : 0.831623
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006685 | Grad Max: 0.220191
  -> Layer: shared_layers.0.bias | Grad Mean: 0.482031 | Grad Max: 2.836068
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005711
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007922 | Grad Max: 0.007922
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003125 | Grad Max: 0.632989
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057884 | Grad Max: 3.535125
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.011127
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.026817 | Grad Max: 0.148970
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000437
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005238 | Grad Max: 0.011527
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000245
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001245 | Grad Max: 0.003587
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000624 | Grad Max: 0.001970
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017532 | Grad Max: 0.017532
[GRADIENT NORM TOTAL] 11.5077

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.841
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54326445 0.45673555] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 688/1360 | B: 610/1246 | C: 404/972
[LOSS Ex1] A: 0.63653 | B: 0.61488 | C: 0.60757
[LOGITS Ex2 A] Mean Abs: 2.215 | Max: 6.657
[LOSS Ex2] A: 0.11360 | B: 0.30189 | C: 0.20725
** [JOINT LOSS] ** : 0.827234
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003280 | Grad Max: 0.169777
  -> Layer: shared_layers.0.bias | Grad Mean: 0.290022 | Grad Max: 1.600195
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005221
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003304 | Grad Max: 0.003304
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001695 | Grad Max: 0.517957
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030945 | Grad Max: 2.861111
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.005271
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012547 | Grad Max: 0.079048
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000260
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002432 | Grad Max: 0.006344
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000158
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000632 | Grad Max: 0.002327
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000386 | Grad Max: 0.001473
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010206 | Grad Max: 0.010206
[GRADIENT NORM TOTAL] 7.1022

[EPOCH SUMMARY] Train Loss: 0.8318

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8154 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 155/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.052
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8032507  0.19674933] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.083
[MASKS] A(Pass/Fail): 751/1297 | B: 658/1390 | C: 572/1476
[LOSS Ex1] A: 0.63005 | B: 0.61458 | C: 0.61396
[LOGITS Ex2 A] Mean Abs: 2.229 | Max: 7.441
[LOSS Ex2] A: 0.10268 | B: 0.32564 | C: 0.22105
** [JOINT LOSS] ** : 0.835985
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003638 | Grad Max: 0.087089
  -> Layer: shared_layers.0.bias | Grad Mean: 0.292820 | Grad Max: 1.177554
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.006115
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000752 | Grad Max: 0.000752
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.254081
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038136 | Grad Max: 1.430156
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000237 | Grad Max: 0.010206
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020997 | Grad Max: 0.128432
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000360
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004195 | Grad Max: 0.009683
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001028 | Grad Max: 0.002901
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000634 | Grad Max: 0.001986
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016315 | Grad Max: 0.016315
[GRADIENT NORM TOTAL] 6.6018

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.169
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50015897 0.499841  ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 723/1325 | B: 645/1403 | C: 584/1464
[LOSS Ex1] A: 0.63717 | B: 0.61505 | C: 0.60820
[LOGITS Ex2 A] Mean Abs: 2.266 | Max: 5.662
[LOSS Ex2] A: 0.09818 | B: 0.31118 | C: 0.21265
** [JOINT LOSS] ** : 0.827474
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005439 | Grad Max: 0.211160
  -> Layer: shared_layers.0.bias | Grad Mean: 0.252182 | Grad Max: 1.000299
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.005357
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002137 | Grad Max: 0.002137
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001742 | Grad Max: 0.548626
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030757 | Grad Max: 3.087435
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000165 | Grad Max: 0.006139
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013917 | Grad Max: 0.075803
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000363
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002936 | Grad Max: 0.007983
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000212
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000707 | Grad Max: 0.002773
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000439 | Grad Max: 0.002082
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011086 | Grad Max: 0.011086
[GRADIENT NORM TOTAL] 6.6097

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.824
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7333982 0.2666018] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.083
[MASKS] A(Pass/Fail): 713/1335 | B: 649/1399 | C: 533/1515
[LOSS Ex1] A: 0.63241 | B: 0.61058 | C: 0.60915
[LOGITS Ex2 A] Mean Abs: 2.262 | Max: 8.318
[LOSS Ex2] A: 0.13271 | B: 0.29580 | C: 0.21822
** [JOINT LOSS] ** : 0.832953
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004575 | Grad Max: 0.182534
  -> Layer: shared_layers.0.bias | Grad Mean: 0.426985 | Grad Max: 2.410327
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006053
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000070 | Grad Max: 0.000070
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002884 | Grad Max: 0.391845
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051904 | Grad Max: 2.172962
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.011261
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022518 | Grad Max: 0.138880
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000327
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004111 | Grad Max: 0.009119
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000196
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000994 | Grad Max: 0.002910
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000483 | Grad Max: 0.001686
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014699 | Grad Max: 0.014699
[GRADIENT NORM TOTAL] 9.6863

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.953
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63787    0.36212996] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.083
[MASKS] A(Pass/Fail): 590/1026 | B: 611/1245 | C: 596/1452
[LOSS Ex1] A: 0.63056 | B: 0.61475 | C: 0.60475
[LOGITS Ex2 A] Mean Abs: 2.327 | Max: 9.556
[LOSS Ex2] A: 0.09858 | B: 0.29763 | C: 0.21218
** [JOINT LOSS] ** : 0.819481
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003497 | Grad Max: 0.130499
  -> Layer: shared_layers.0.bias | Grad Mean: 0.280864 | Grad Max: 1.756748
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.005952
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008495 | Grad Max: 0.008495
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.297966
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036022 | Grad Max: 1.637293
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000193 | Grad Max: 0.008170
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017015 | Grad Max: 0.121691
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000239
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003145 | Grad Max: 0.007183
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000194
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000768 | Grad Max: 0.002654
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000387 | Grad Max: 0.001439
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010692 | Grad Max: 0.010692
[GRADIENT NORM TOTAL] 6.7581

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.171
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081099  0.49189013] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 658/1390 | C: 580/1468
[LOSS Ex1] A: 0.63128 | B: 0.61445 | C: 0.61443
[LOGITS Ex2 A] Mean Abs: 2.281 | Max: 8.097
[LOSS Ex2] A: 0.09343 | B: 0.31853 | C: 0.22015
** [JOINT LOSS] ** : 0.830757
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003544 | Grad Max: 0.156303
  -> Layer: shared_layers.0.bias | Grad Mean: 0.390973 | Grad Max: 2.086279
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005381
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001206 | Grad Max: 0.001206
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.768567
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041881 | Grad Max: 4.264465
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.008745
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018449 | Grad Max: 0.113282
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000299
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003534 | Grad Max: 0.007820
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000220
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000872 | Grad Max: 0.002803
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000463 | Grad Max: 0.001926
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013104 | Grad Max: 0.013104
[GRADIENT NORM TOTAL] 9.7484

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.104
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50702214 0.49297792] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 719/1329 | B: 645/1403 | C: 609/1439
[LOSS Ex1] A: 0.62797 | B: 0.61493 | C: 0.60272
[LOGITS Ex2 A] Mean Abs: 2.245 | Max: 6.932
[LOSS Ex2] A: 0.09959 | B: 0.31703 | C: 0.20947
** [JOINT LOSS] ** : 0.823902
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003249 | Grad Max: 0.147978
  -> Layer: shared_layers.0.bias | Grad Mean: 0.398789 | Grad Max: 1.955774
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.006003
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002188 | Grad Max: 0.002188
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.758862
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042626 | Grad Max: 4.192188
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.007596
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018011 | Grad Max: 0.107687
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000269
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003332 | Grad Max: 0.007098
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000819 | Grad Max: 0.002584
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000466 | Grad Max: 0.001815
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012975 | Grad Max: 0.012975
[GRADIENT NORM TOTAL] 9.5252

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.137
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50884104 0.491159  ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 717/1331 | B: 651/1397 | C: 594/1454
[LOSS Ex1] A: 0.62693 | B: 0.61045 | C: 0.61067
[LOGITS Ex2 A] Mean Abs: 2.237 | Max: 6.742
[LOSS Ex2] A: 0.11114 | B: 0.29704 | C: 0.22446
** [JOINT LOSS] ** : 0.826895
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005323 | Grad Max: 0.227302
  -> Layer: shared_layers.0.bias | Grad Mean: 0.171922 | Grad Max: 0.801632
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.006344
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002723 | Grad Max: 0.002723
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001327 | Grad Max: 0.533285
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022401 | Grad Max: 2.949156
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.004143
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010300 | Grad Max: 0.050995
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000266
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002312 | Grad Max: 0.005969
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000142
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000549 | Grad Max: 0.001874
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001374
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006607 | Grad Max: 0.006607
[GRADIENT NORM TOTAL] 4.8634

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.943
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50117105 0.49882892] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.082
[MASKS] A(Pass/Fail): 688/1360 | B: 611/1245 | C: 597/1451
[LOSS Ex1] A: 0.63582 | B: 0.61462 | C: 0.60848
[LOGITS Ex2 A] Mean Abs: 2.217 | Max: 6.255
[LOSS Ex2] A: 0.10551 | B: 0.29877 | C: 0.21173
** [JOINT LOSS] ** : 0.824975
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002498 | Grad Max: 0.068261
  -> Layer: shared_layers.0.bias | Grad Mean: 0.080727 | Grad Max: 0.460734
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005506
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001818 | Grad Max: 0.001818
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000695 | Grad Max: 0.144976
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011767 | Grad Max: 0.811251
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002153
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002505 | Grad Max: 0.020263
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000118
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000409 | Grad Max: 0.002541
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000100 | Grad Max: 0.000737
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000371 | Grad Max: 0.001059
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000688 | Grad Max: 0.000688
[GRADIENT NORM TOTAL] 2.4131

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.844
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5432385  0.45676142] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 689/1359 | B: 658/1390 | C: 589/1459
[LOSS Ex1] A: 0.63635 | B: 0.61433 | C: 0.60903
[LOGITS Ex2 A] Mean Abs: 2.204 | Max: 6.409
[LOSS Ex2] A: 0.10842 | B: 0.30872 | C: 0.20257
** [JOINT LOSS] ** : 0.826473
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001711 | Grad Max: 0.032060
  -> Layer: shared_layers.0.bias | Grad Mean: 0.053721 | Grad Max: 0.288284
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.005680
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006946 | Grad Max: 0.006946
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000616 | Grad Max: 0.149468
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.010676 | Grad Max: 0.827307
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002571
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002525 | Grad Max: 0.027773
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000142
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000436 | Grad Max: 0.002794
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000114 | Grad Max: 0.000796
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001212
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001647 | Grad Max: 0.001647
[GRADIENT NORM TOTAL] 2.1344

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.056
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8041074  0.19589256] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.083
[MASKS] A(Pass/Fail): 752/1296 | B: 645/1403 | C: 592/1456
[LOSS Ex1] A: 0.62986 | B: 0.61478 | C: 0.61239
[LOGITS Ex2 A] Mean Abs: 2.268 | Max: 7.599
[LOSS Ex2] A: 0.10032 | B: 0.31902 | C: 0.23545
** [JOINT LOSS] ** : 0.837274
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003864 | Grad Max: 0.167335
  -> Layer: shared_layers.0.bias | Grad Mean: 0.406388 | Grad Max: 2.038413
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006002
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004451 | Grad Max: 0.004451
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002716 | Grad Max: 0.600284
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049787 | Grad Max: 3.356731
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.008543
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022564 | Grad Max: 0.115386
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000380
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004377 | Grad Max: 0.009684
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000229
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001042 | Grad Max: 0.003499
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000490 | Grad Max: 0.001596
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014230 | Grad Max: 0.014230
[GRADIENT NORM TOTAL] 10.2043

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.174
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50019974 0.49980026] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 724/1324 | B: 651/1397 | C: 573/1475
[LOSS Ex1] A: 0.63699 | B: 0.61029 | C: 0.60703
[LOGITS Ex2 A] Mean Abs: 2.241 | Max: 5.846
[LOSS Ex2] A: 0.09568 | B: 0.29660 | C: 0.19256
** [JOINT LOSS] ** : 0.813047
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005661 | Grad Max: 0.183811
  -> Layer: shared_layers.0.bias | Grad Mean: 0.184458 | Grad Max: 0.916484
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.005442
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006021 | Grad Max: 0.006021
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001336 | Grad Max: 0.421721
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021870 | Grad Max: 2.341128
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003779
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004013 | Grad Max: 0.038516
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000162
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000515 | Grad Max: 0.003445
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000170 | Grad Max: 0.001172
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000470 | Grad Max: 0.001248
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002367 | Grad Max: 0.002367
[GRADIENT NORM TOTAL] 4.8372

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.827
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7340357  0.26596433] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.083
[MASKS] A(Pass/Fail): 713/1335 | B: 611/1245 | C: 568/1480
[LOSS Ex1] A: 0.63221 | B: 0.61445 | C: 0.61287
[LOGITS Ex2 A] Mean Abs: 2.213 | Max: 6.403
[LOSS Ex2] A: 0.12734 | B: 0.30646 | C: 0.23252
** [JOINT LOSS] ** : 0.841952
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009623 | Grad Max: 0.285598
  -> Layer: shared_layers.0.bias | Grad Mean: 0.483274 | Grad Max: 2.210505
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.005524
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002583 | Grad Max: 0.002583
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003138 | Grad Max: 0.588366
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057397 | Grad Max: 3.289949
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.011082
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030860 | Grad Max: 0.152320
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000561
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006254 | Grad Max: 0.012745
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000319
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001554 | Grad Max: 0.004715
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000950 | Grad Max: 0.002473
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024593 | Grad Max: 0.024593
[GRADIENT NORM TOTAL] 10.2337

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.957
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6382151  0.36178482] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 590/1026 | B: 658/1390 | C: 562/1486
[LOSS Ex1] A: 0.63035 | B: 0.61416 | C: 0.60650
[LOGITS Ex2 A] Mean Abs: 2.303 | Max: 8.997
[LOSS Ex2] A: 0.09675 | B: 0.31804 | C: 0.21903
** [JOINT LOSS] ** : 0.828275
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005641 | Grad Max: 0.155749
  -> Layer: shared_layers.0.bias | Grad Mean: 0.334207 | Grad Max: 1.974810
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.007187
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.017303 | Grad Max: 0.017303
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.651462
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040975 | Grad Max: 3.647148
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000221 | Grad Max: 0.006781
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019164 | Grad Max: 0.106671
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000388
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003853 | Grad Max: 0.008399
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000979 | Grad Max: 0.002687
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000621 | Grad Max: 0.002183
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016178 | Grad Max: 0.016178
[GRADIENT NORM TOTAL] 8.6096

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.176
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081617  0.49183828] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 646/1402 | C: 413/963
[LOSS Ex1] A: 0.63108 | B: 0.61461 | C: 0.60652
[LOGITS Ex2 A] Mean Abs: 2.317 | Max: 6.716
[LOSS Ex2] A: 0.09537 | B: 0.31761 | C: 0.24345
** [JOINT LOSS] ** : 0.836214
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005853 | Grad Max: 0.182984
  -> Layer: shared_layers.0.bias | Grad Mean: 0.558734 | Grad Max: 2.600561
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005471
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005405 | Grad Max: 0.005405
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003612 | Grad Max: 0.601803
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066629 | Grad Max: 3.333735
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.012915
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036443 | Grad Max: 0.192689
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000534
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007002 | Grad Max: 0.015272
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000327
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001633 | Grad Max: 0.005284
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000784 | Grad Max: 0.002151
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021495 | Grad Max: 0.021495
[GRADIENT NORM TOTAL] 12.1369

[EPOCH SUMMARY] Train Loss: 0.8290

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8149 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 156/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.109
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50688756 0.49311242] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 719/1329 | B: 652/1396 | C: 556/1492
[LOSS Ex1] A: 0.62776 | B: 0.61013 | C: 0.61131
[LOGITS Ex2 A] Mean Abs: 2.332 | Max: 5.771
[LOSS Ex2] A: 0.11014 | B: 0.29576 | C: 0.23358
** [JOINT LOSS] ** : 0.829560
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008059 | Grad Max: 0.211411
  -> Layer: shared_layers.0.bias | Grad Mean: 0.600612 | Grad Max: 2.759404
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.005898
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004734 | Grad Max: 0.004734
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004055 | Grad Max: 0.538892
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074584 | Grad Max: 2.994457
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000469 | Grad Max: 0.014972
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.041661 | Grad Max: 0.230978
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000680
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008280 | Grad Max: 0.017447
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000391
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001999 | Grad Max: 0.006133
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001055 | Grad Max: 0.002552
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028719 | Grad Max: 0.028719
[GRADIENT NORM TOTAL] 13.0186

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.142
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50898594 0.49101412] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 718/1330 | B: 613/1243 | C: 619/1429
[LOSS Ex1] A: 0.62671 | B: 0.61429 | C: 0.60355
[LOGITS Ex2 A] Mean Abs: 2.261 | Max: 8.543
[LOSS Ex2] A: 0.11378 | B: 0.30129 | C: 0.20796
** [JOINT LOSS] ** : 0.822529
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005693 | Grad Max: 0.234105
  -> Layer: shared_layers.0.bias | Grad Mean: 0.261290 | Grad Max: 1.575961
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.006474
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002572 | Grad Max: 0.002572
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001862 | Grad Max: 0.532848
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031199 | Grad Max: 2.982148
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.005720
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006833 | Grad Max: 0.068661
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000194
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000783 | Grad Max: 0.003726
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000196 | Grad Max: 0.001046
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000306 | Grad Max: 0.000882
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003044 | Grad Max: 0.003044
[GRADIENT NORM TOTAL] 6.6072

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.948
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50120294 0.498797  ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.082
[MASKS] A(Pass/Fail): 688/1360 | B: 658/1390 | C: 565/1483
[LOSS Ex1] A: 0.63561 | B: 0.61401 | C: 0.60895
[LOGITS Ex2 A] Mean Abs: 2.216 | Max: 5.806
[LOSS Ex2] A: 0.09981 | B: 0.32937 | C: 0.21710
** [JOINT LOSS] ** : 0.834953
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005126 | Grad Max: 0.251061
  -> Layer: shared_layers.0.bias | Grad Mean: 0.650604 | Grad Max: 3.430507
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.005659
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006997 | Grad Max: 0.006997
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004276 | Grad Max: 0.868649
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080192 | Grad Max: 4.801690
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000449 | Grad Max: 0.016006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040638 | Grad Max: 0.223652
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000536
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007676 | Grad Max: 0.015107
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000358
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001873 | Grad Max: 0.005591
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001035 | Grad Max: 0.002805
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028613 | Grad Max: 0.028613
[GRADIENT NORM TOTAL] 15.6169

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.847
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5431735  0.45682657] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 689/1359 | B: 646/1402 | C: 600/1448
[LOSS Ex1] A: 0.63616 | B: 0.61447 | C: 0.60528
[LOGITS Ex2 A] Mean Abs: 2.239 | Max: 6.160
[LOSS Ex2] A: 0.10985 | B: 0.31334 | C: 0.21601
** [JOINT LOSS] ** : 0.831705
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003461 | Grad Max: 0.073710
  -> Layer: shared_layers.0.bias | Grad Mean: 0.259709 | Grad Max: 1.091612
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.005459
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004997 | Grad Max: 0.004997
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001721 | Grad Max: 0.658223
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031618 | Grad Max: 3.658653
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.005592
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015073 | Grad Max: 0.081881
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000272
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002934 | Grad Max: 0.006887
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000172
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000706 | Grad Max: 0.002315
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000386 | Grad Max: 0.001392
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010412 | Grad Max: 0.010412
[GRADIENT NORM TOTAL] 6.7809

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.061
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8050433  0.19495672] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.084
[MASKS] A(Pass/Fail): 752/1296 | B: 653/1395 | C: 604/1444
[LOSS Ex1] A: 0.62967 | B: 0.60999 | C: 0.60674
[LOGITS Ex2 A] Mean Abs: 2.303 | Max: 7.012
[LOSS Ex2] A: 0.09769 | B: 0.29205 | C: 0.19993
** [JOINT LOSS] ** : 0.812024
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007354 | Grad Max: 0.218204
  -> Layer: shared_layers.0.bias | Grad Mean: 0.543717 | Grad Max: 2.991340
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.005911
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003282 | Grad Max: 0.003282
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003494 | Grad Max: 0.578011
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064576 | Grad Max: 3.244424
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.012707
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032545 | Grad Max: 0.176336
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000501
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006366 | Grad Max: 0.013386
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001520 | Grad Max: 0.004555
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000787 | Grad Max: 0.002397
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021948 | Grad Max: 0.021948
[GRADIENT NORM TOTAL] 12.3653

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002573  0.49974266] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.082
[MASKS] A(Pass/Fail): 724/1324 | B: 613/1243 | C: 602/1446
[LOSS Ex1] A: 0.63681 | B: 0.61415 | C: 0.60764
[LOGITS Ex2 A] Mean Abs: 2.327 | Max: 6.385
[LOSS Ex2] A: 0.10124 | B: 0.29914 | C: 0.21568
** [JOINT LOSS] ** : 0.824888
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005474 | Grad Max: 0.218200
  -> Layer: shared_layers.0.bias | Grad Mean: 0.535448 | Grad Max: 2.851196
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005341
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000926 | Grad Max: 0.000926
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003438 | Grad Max: 0.676840
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063721 | Grad Max: 3.781099
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000358 | Grad Max: 0.012076
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031949 | Grad Max: 0.174672
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000431
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005978 | Grad Max: 0.012167
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000301
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001443 | Grad Max: 0.004053
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000720 | Grad Max: 0.002171
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020681 | Grad Max: 0.020681
[GRADIENT NORM TOTAL] 12.4996

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.831
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73459584 0.2654041 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.084
[MASKS] A(Pass/Fail): 713/1335 | B: 659/1389 | C: 568/1480
[LOSS Ex1] A: 0.63203 | B: 0.61388 | C: 0.61028
[LOGITS Ex2 A] Mean Abs: 2.250 | Max: 6.840
[LOSS Ex2] A: 0.10958 | B: 0.32030 | C: 0.21198
** [JOINT LOSS] ** : 0.832684
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003921 | Grad Max: 0.173641
  -> Layer: shared_layers.0.bias | Grad Mean: 0.135613 | Grad Max: 1.425563
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006072
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001867 | Grad Max: 0.001867
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001104 | Grad Max: 0.285587
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018190 | Grad Max: 1.577303
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.003269
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002647 | Grad Max: 0.024926
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000356 | Grad Max: 0.002118
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000073
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000098 | Grad Max: 0.000500
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000286 | Grad Max: 0.000986
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001312 | Grad Max: 0.001312
[GRADIENT NORM TOTAL] 3.9748

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6384724  0.36152765] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 590/1026 | B: 646/1402 | C: 570/1478
[LOSS Ex1] A: 0.63017 | B: 0.61434 | C: 0.60803
[LOGITS Ex2 A] Mean Abs: 2.293 | Max: 7.998
[LOSS Ex2] A: 0.10680 | B: 0.32315 | C: 0.22808
** [JOINT LOSS] ** : 0.836857
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006226 | Grad Max: 0.178059
  -> Layer: shared_layers.0.bias | Grad Mean: 0.523794 | Grad Max: 2.258018
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006319
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011600 | Grad Max: 0.011600
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003439 | Grad Max: 0.379696
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063211 | Grad Max: 2.142601
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.015163
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036239 | Grad Max: 0.199207
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000648
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007199 | Grad Max: 0.014787
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000326
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001805 | Grad Max: 0.004996
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001037 | Grad Max: 0.002579
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028568 | Grad Max: 0.028568
[GRADIENT NORM TOTAL] 10.8873

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.180
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081915 0.4918085] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 653/1395 | C: 595/1453
[LOSS Ex1] A: 0.63090 | B: 0.60986 | C: 0.61104
[LOGITS Ex2 A] Mean Abs: 2.268 | Max: 10.175
[LOSS Ex2] A: 0.09636 | B: 0.29571 | C: 0.21892
** [JOINT LOSS] ** : 0.820934
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002408 | Grad Max: 0.090016
  -> Layer: shared_layers.0.bias | Grad Mean: 0.245006 | Grad Max: 1.189641
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.005180
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000566 | Grad Max: 0.000566
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001543 | Grad Max: 0.368607
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028031 | Grad Max: 2.042300
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.006671
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015296 | Grad Max: 0.090306
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000289
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002854 | Grad Max: 0.007775
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000178
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000675 | Grad Max: 0.002484
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000388 | Grad Max: 0.001676
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009969 | Grad Max: 0.009969
[GRADIENT NORM TOTAL] 5.4795

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.113
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068304 0.4931696] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 720/1328 | B: 613/1243 | C: 584/1464
[LOSS Ex1] A: 0.62758 | B: 0.61402 | C: 0.60386
[LOGITS Ex2 A] Mean Abs: 2.316 | Max: 6.349
[LOSS Ex2] A: 0.11378 | B: 0.30906 | C: 0.21510
** [JOINT LOSS] ** : 0.827801
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008334 | Grad Max: 0.219607
  -> Layer: shared_layers.0.bias | Grad Mean: 0.561810 | Grad Max: 2.677314
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002308 | Grad Max: 0.006780
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008213 | Grad Max: 0.008213
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003760 | Grad Max: 0.477254
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069635 | Grad Max: 2.656878
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000411 | Grad Max: 0.014030
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036673 | Grad Max: 0.200695
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000539
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007236 | Grad Max: 0.015377
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000348
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001763 | Grad Max: 0.005096
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000955 | Grad Max: 0.002334
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026556 | Grad Max: 0.026556
[GRADIENT NORM TOTAL] 12.2492

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.147
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.509067   0.49093294] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 718/1330 | B: 659/1389 | C: 613/1435
[LOSS Ex1] A: 0.62654 | B: 0.61376 | C: 0.60871
[LOGITS Ex2 A] Mean Abs: 2.290 | Max: 8.077
[LOSS Ex2] A: 0.12157 | B: 0.32530 | C: 0.22157
** [JOINT LOSS] ** : 0.839150
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011774 | Grad Max: 0.323701
  -> Layer: shared_layers.0.bias | Grad Mean: 0.655926 | Grad Max: 2.661777
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.006501
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000142 | Grad Max: 0.000142
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004558 | Grad Max: 0.545714
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083343 | Grad Max: 3.035133
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000515 | Grad Max: 0.015925
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045059 | Grad Max: 0.232359
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000776
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009091 | Grad Max: 0.019596
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000467
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002190 | Grad Max: 0.007352
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001139 | Grad Max: 0.002879
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031087 | Grad Max: 0.031087
[GRADIENT NORM TOTAL] 13.8341

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.951
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5011738  0.49882624] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.082
[MASKS] A(Pass/Fail): 688/1360 | B: 646/1402 | C: 601/1447
[LOSS Ex1] A: 0.63545 | B: 0.61422 | C: 0.60891
[LOGITS Ex2 A] Mean Abs: 2.216 | Max: 6.282
[LOSS Ex2] A: 0.09948 | B: 0.30808 | C: 0.22110
** [JOINT LOSS] ** : 0.829082
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002644 | Grad Max: 0.082241
  -> Layer: shared_layers.0.bias | Grad Mean: 0.101058 | Grad Max: 0.623551
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.005238
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004219 | Grad Max: 0.004219
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000850 | Grad Max: 0.182180
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014306 | Grad Max: 0.989350
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003951
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003280 | Grad Max: 0.032803
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000154
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000482 | Grad Max: 0.002517
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000062
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000605
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000372 | Grad Max: 0.001099
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002029 | Grad Max: 0.002029
[GRADIENT NORM TOTAL] 3.0079

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.850
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5431843  0.45681572] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.079
[MASKS] A(Pass/Fail): 689/1359 | B: 653/1395 | C: 550/1498
[LOSS Ex1] A: 0.63600 | B: 0.60973 | C: 0.61148
[LOGITS Ex2 A] Mean Abs: 2.186 | Max: 5.419
[LOSS Ex2] A: 0.11032 | B: 0.29243 | C: 0.23453
** [JOINT LOSS] ** : 0.831497
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005720 | Grad Max: 0.212417
  -> Layer: shared_layers.0.bias | Grad Mean: 0.525648 | Grad Max: 2.686779
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.006293
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013398 | Grad Max: 0.013398
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003164 | Grad Max: 0.862336
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058702 | Grad Max: 4.769650
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000321 | Grad Max: 0.011005
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028937 | Grad Max: 0.161332
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000430
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005703 | Grad Max: 0.011032
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000313
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001421 | Grad Max: 0.004412
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000796 | Grad Max: 0.002438
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021910 | Grad Max: 0.021910
[GRADIENT NORM TOTAL] 12.5446

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.065
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.80583656 0.19416343] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.084
[MASKS] A(Pass/Fail): 752/1296 | B: 613/1243 | C: 357/1019
[LOSS Ex1] A: 0.62951 | B: 0.61390 | C: 0.61776
[LOGITS Ex2 A] Mean Abs: 2.230 | Max: 7.805
[LOSS Ex2] A: 0.09421 | B: 0.29981 | C: 0.26375
** [JOINT LOSS] ** : 0.839641
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003555 | Grad Max: 0.187002
  -> Layer: shared_layers.0.bias | Grad Mean: 0.379988 | Grad Max: 2.088233
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.005275
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001313 | Grad Max: 0.001313
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.634654
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039515 | Grad Max: 3.491164
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.009975
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019837 | Grad Max: 0.117367
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000324
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003896 | Grad Max: 0.008542
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000211
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001011 | Grad Max: 0.003058
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000572 | Grad Max: 0.001692
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016790 | Grad Max: 0.016790
[GRADIENT NORM TOTAL] 8.6935

[EPOCH SUMMARY] Train Loss: 0.8295

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8127 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8149 -> New: 0.8127)

############################## EPOCH 157/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.183
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002708 0.4997292] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.083
[MASKS] A(Pass/Fail): 724/1324 | B: 659/1389 | C: 586/1462
[LOSS Ex1] A: 0.63665 | B: 0.61364 | C: 0.60593
[LOGITS Ex2 A] Mean Abs: 2.286 | Max: 7.116
[LOSS Ex2] A: 0.08777 | B: 0.31969 | C: 0.21251
** [JOINT LOSS] ** : 0.825399
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003548 | Grad Max: 0.140361
  -> Layer: shared_layers.0.bias | Grad Mean: 0.383234 | Grad Max: 1.779468
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005256
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002983 | Grad Max: 0.002983
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002660 | Grad Max: 0.368186
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048400 | Grad Max: 2.041661
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.010165
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024782 | Grad Max: 0.141471
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000381
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004527 | Grad Max: 0.009597
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000210
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001101 | Grad Max: 0.003207
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000515 | Grad Max: 0.001836
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015978 | Grad Max: 0.015978
[GRADIENT NORM TOTAL] 8.6718

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.834
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73514456 0.26485547] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.084
[MASKS] A(Pass/Fail): 714/1334 | B: 646/1402 | C: 584/1464
[LOSS Ex1] A: 0.63186 | B: 0.61409 | C: 0.60506
[LOGITS Ex2 A] Mean Abs: 2.243 | Max: 6.947
[LOSS Ex2] A: 0.12214 | B: 0.31729 | C: 0.22134
** [JOINT LOSS] ** : 0.837260
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004434 | Grad Max: 0.172871
  -> Layer: shared_layers.0.bias | Grad Mean: 0.362127 | Grad Max: 2.199117
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006012
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010368 | Grad Max: 0.010368
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002413 | Grad Max: 0.432402
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043339 | Grad Max: 2.403100
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000208 | Grad Max: 0.010333
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018629 | Grad Max: 0.114446
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000283
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003411 | Grad Max: 0.007866
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000147
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000812 | Grad Max: 0.002231
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.001445
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011374 | Grad Max: 0.011374
[GRADIENT NORM TOTAL] 8.5838

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.965
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6387481 0.3612519] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.084
[MASKS] A(Pass/Fail): 590/1026 | B: 653/1395 | C: 558/1490
[LOSS Ex1] A: 0.63000 | B: 0.60961 | C: 0.61023
[LOGITS Ex2 A] Mean Abs: 2.246 | Max: 8.015
[LOSS Ex2] A: 0.10292 | B: 0.29444 | C: 0.23253
** [JOINT LOSS] ** : 0.826576
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004841 | Grad Max: 0.144939
  -> Layer: shared_layers.0.bias | Grad Mean: 0.175856 | Grad Max: 1.195758
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005732
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008023 | Grad Max: 0.008023
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001320 | Grad Max: 0.168387
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023308 | Grad Max: 0.929998
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000137 | Grad Max: 0.006574
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011883 | Grad Max: 0.073321
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000277
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002488 | Grad Max: 0.006227
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000174
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.002288
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000402 | Grad Max: 0.001633
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010931 | Grad Max: 0.010931
[GRADIENT NORM TOTAL] 4.0239

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.185
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081894  0.49181062] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 723/1325 | B: 613/1243 | C: 596/1452
[LOSS Ex1] A: 0.63073 | B: 0.61377 | C: 0.60632
[LOGITS Ex2 A] Mean Abs: 2.250 | Max: 8.729
[LOSS Ex2] A: 0.09933 | B: 0.30300 | C: 0.21349
** [JOINT LOSS] ** : 0.822211
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002797 | Grad Max: 0.094142
  -> Layer: shared_layers.0.bias | Grad Mean: 0.102614 | Grad Max: 0.370398
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.006106
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006131 | Grad Max: 0.006131
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000713 | Grad Max: 0.190896
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011995 | Grad Max: 1.049854
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002305
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002162 | Grad Max: 0.019623
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000132
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000383 | Grad Max: 0.002813
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000100 | Grad Max: 0.000552
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000372 | Grad Max: 0.001076
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000884 | Grad Max: 0.000884
[GRADIENT NORM TOTAL] 2.7491

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.117
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50675005 0.4932499 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 720/1328 | B: 659/1389 | C: 584/1464
[LOSS Ex1] A: 0.62740 | B: 0.61351 | C: 0.60871
[LOGITS Ex2 A] Mean Abs: 2.276 | Max: 6.059
[LOSS Ex2] A: 0.10216 | B: 0.31654 | C: 0.20476
** [JOINT LOSS] ** : 0.824360
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003986 | Grad Max: 0.119048
  -> Layer: shared_layers.0.bias | Grad Mean: 0.169245 | Grad Max: 1.186606
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.005931
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000338 | Grad Max: 0.000338
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001215 | Grad Max: 0.335827
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021688 | Grad Max: 1.839101
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004651
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010833 | Grad Max: 0.064112
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000256
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002211 | Grad Max: 0.005904
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000121
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000536 | Grad Max: 0.001595
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000364 | Grad Max: 0.001300
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007469 | Grad Max: 0.007469
[GRADIENT NORM TOTAL] 4.2067

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.151
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5091447  0.49085534] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 721/1327 | B: 646/1402 | C: 643/1405
[LOSS Ex1] A: 0.62635 | B: 0.61395 | C: 0.60533
[LOGITS Ex2 A] Mean Abs: 2.218 | Max: 7.452
[LOSS Ex2] A: 0.11976 | B: 0.31604 | C: 0.21367
** [JOINT LOSS] ** : 0.831704
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003641 | Grad Max: 0.125250
  -> Layer: shared_layers.0.bias | Grad Mean: 0.282281 | Grad Max: 1.681771
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006529
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001399 | Grad Max: 0.001399
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001846 | Grad Max: 0.514216
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033046 | Grad Max: 2.821452
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.006820
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013512 | Grad Max: 0.096123
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000218
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002339 | Grad Max: 0.006260
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000130
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000555 | Grad Max: 0.001826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.001320
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008709 | Grad Max: 0.008709
[GRADIENT NORM TOTAL] 7.0020

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.955
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50114894 0.4988511 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.561 | Std: 0.083
[MASKS] A(Pass/Fail): 688/1360 | B: 653/1395 | C: 564/1484
[LOSS Ex1] A: 0.63526 | B: 0.60945 | C: 0.60839
[LOGITS Ex2 A] Mean Abs: 2.197 | Max: 5.816
[LOSS Ex2] A: 0.09685 | B: 0.28676 | C: 0.22880
** [JOINT LOSS] ** : 0.821838
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003316 | Grad Max: 0.091841
  -> Layer: shared_layers.0.bias | Grad Mean: 0.169468 | Grad Max: 0.851953
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005509
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001600 | Grad Max: 0.001600
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001138 | Grad Max: 0.443816
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019673 | Grad Max: 2.466603
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.004295
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005639 | Grad Max: 0.046831
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000181
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000935 | Grad Max: 0.004192
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000078
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000211 | Grad Max: 0.000829
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000423 | Grad Max: 0.001189
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003008 | Grad Max: 0.003008
[GRADIENT NORM TOTAL] 4.7809

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.854
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5431174  0.45688257] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 691/1357 | B: 613/1243 | C: 598/1450
[LOSS Ex1] A: 0.63582 | B: 0.61360 | C: 0.60975
[LOGITS Ex2 A] Mean Abs: 2.213 | Max: 6.007
[LOSS Ex2] A: 0.10423 | B: 0.29529 | C: 0.21326
** [JOINT LOSS] ** : 0.823986
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003320 | Grad Max: 0.118124
  -> Layer: shared_layers.0.bias | Grad Mean: 0.238981 | Grad Max: 1.072875
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005439
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002876 | Grad Max: 0.002876
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.293769
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026104 | Grad Max: 1.603777
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.006414
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011597 | Grad Max: 0.074806
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000240
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002261 | Grad Max: 0.006043
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000131
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000596 | Grad Max: 0.001918
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000399 | Grad Max: 0.001551
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009142 | Grad Max: 0.009142
[GRADIENT NORM TOTAL] 5.4004

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.069
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.806801   0.19319896] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.084
[MASKS] A(Pass/Fail): 752/1296 | B: 659/1389 | C: 572/1476
[LOSS Ex1] A: 0.62932 | B: 0.61334 | C: 0.61028
[LOGITS Ex2 A] Mean Abs: 2.226 | Max: 6.437
[LOSS Ex2] A: 0.10542 | B: 0.32074 | C: 0.21449
** [JOINT LOSS] ** : 0.831196
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002352 | Grad Max: 0.069171
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143298 | Grad Max: 0.944270
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005597
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000119 | Grad Max: 0.000119
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000951 | Grad Max: 0.234409
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016712 | Grad Max: 1.302522
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.004535
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005027 | Grad Max: 0.042310
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000175
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000908 | Grad Max: 0.003686
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000067
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000202 | Grad Max: 0.000907
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.000969
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002029 | Grad Max: 0.002029
[GRADIENT NORM TOTAL] 3.5866

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.188
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50026906 0.4997309 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.083
[MASKS] A(Pass/Fail): 724/1324 | B: 646/1402 | C: 565/1483
[LOSS Ex1] A: 0.63647 | B: 0.61378 | C: 0.60658
[LOGITS Ex2 A] Mean Abs: 2.221 | Max: 6.186
[LOSS Ex2] A: 0.08588 | B: 0.31946 | C: 0.21392
** [JOINT LOSS] ** : 0.825362
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005208 | Grad Max: 0.135971
  -> Layer: shared_layers.0.bias | Grad Mean: 0.387676 | Grad Max: 1.883190
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.005237
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000783 | Grad Max: 0.000783
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002561 | Grad Max: 0.765227
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046841 | Grad Max: 4.253297
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000269 | Grad Max: 0.008995
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023945 | Grad Max: 0.124429
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000449
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004771 | Grad Max: 0.011682
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000263
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001133 | Grad Max: 0.003856
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000626 | Grad Max: 0.002614
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016978 | Grad Max: 0.016978
[GRADIENT NORM TOTAL] 9.6201

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.838
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.735948   0.26405194] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.084
[MASKS] A(Pass/Fail): 714/1334 | B: 653/1395 | C: 574/1474
[LOSS Ex1] A: 0.63166 | B: 0.60928 | C: 0.60731
[LOGITS Ex2 A] Mean Abs: 2.215 | Max: 7.002
[LOSS Ex2] A: 0.11554 | B: 0.29528 | C: 0.23121
** [JOINT LOSS] ** : 0.830090
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006661 | Grad Max: 0.175301
  -> Layer: shared_layers.0.bias | Grad Mean: 0.302390 | Grad Max: 1.223943
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002142 | Grad Max: 0.005913
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004426 | Grad Max: 0.004426
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.724845
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035436 | Grad Max: 4.006245
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000232 | Grad Max: 0.008081
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019846 | Grad Max: 0.103750
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000393
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003976 | Grad Max: 0.009347
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000238
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000956 | Grad Max: 0.003162
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.002047
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014270 | Grad Max: 0.014270
[GRADIENT NORM TOTAL] 7.3235

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.970
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6391472 0.3608528] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.084
[MASKS] A(Pass/Fail): 590/1026 | B: 613/1243 | C: 612/1436
[LOSS Ex1] A: 0.62979 | B: 0.61342 | C: 0.60841
[LOGITS Ex2 A] Mean Abs: 2.299 | Max: 11.122
[LOSS Ex2] A: 0.10873 | B: 0.29584 | C: 0.21830
** [JOINT LOSS] ** : 0.824829
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003752 | Grad Max: 0.221480
  -> Layer: shared_layers.0.bias | Grad Mean: 0.444011 | Grad Max: 2.916276
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.006012
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002571 | Grad Max: 0.002571
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002843 | Grad Max: 0.493130
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052173 | Grad Max: 2.703733
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000282 | Grad Max: 0.011074
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025832 | Grad Max: 0.148893
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000382
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004822 | Grad Max: 0.010703
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001160 | Grad Max: 0.003139
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000538 | Grad Max: 0.001970
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016630 | Grad Max: 0.016630
[GRADIENT NORM TOTAL] 10.3892

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.190
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50814945 0.49185058] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 723/1325 | B: 659/1389 | C: 593/1455
[LOSS Ex1] A: 0.63052 | B: 0.61318 | C: 0.61043
[LOGITS Ex2 A] Mean Abs: 2.296 | Max: 10.431
[LOSS Ex2] A: 0.09531 | B: 0.31850 | C: 0.22236
** [JOINT LOSS] ** : 0.830102
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004413 | Grad Max: 0.213261
  -> Layer: shared_layers.0.bias | Grad Mean: 0.551957 | Grad Max: 2.872541
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.005511
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001245 | Grad Max: 0.001245
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003729 | Grad Max: 0.557624
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069232 | Grad Max: 3.092756
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.014267
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036689 | Grad Max: 0.206255
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000551
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007050 | Grad Max: 0.014467
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000311
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001708 | Grad Max: 0.004858
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000793 | Grad Max: 0.002264
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024078 | Grad Max: 0.024078
[GRADIENT NORM TOTAL] 12.7830

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.123
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067339 0.4932661] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 720/1328 | B: 646/1402 | C: 379/997
[LOSS Ex1] A: 0.62719 | B: 0.61362 | C: 0.61245
[LOGITS Ex2 A] Mean Abs: 2.247 | Max: 6.263
[LOSS Ex2] A: 0.10232 | B: 0.30991 | C: 0.22495
** [JOINT LOSS] ** : 0.830150
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002447 | Grad Max: 0.045752
  -> Layer: shared_layers.0.bias | Grad Mean: 0.134412 | Grad Max: 0.557344
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005698
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002716 | Grad Max: 0.002716
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000914 | Grad Max: 0.413251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016031 | Grad Max: 2.313596
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.003904
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002599 | Grad Max: 0.023597
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000130
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000451 | Grad Max: 0.002933
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000117 | Grad Max: 0.000634
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000353 | Grad Max: 0.001229
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001782 | Grad Max: 0.001782
[GRADIENT NORM TOTAL] 4.1409

[EPOCH SUMMARY] Train Loss: 0.8275

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8114 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8127 -> New: 0.8114)

############################## EPOCH 158/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.157
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50914294 0.4908571 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 721/1327 | B: 653/1395 | C: 588/1460
[LOSS Ex1] A: 0.62615 | B: 0.60912 | C: 0.60741
[LOGITS Ex2 A] Mean Abs: 2.211 | Max: 7.056
[LOSS Ex2] A: 0.10545 | B: 0.29204 | C: 0.22381
** [JOINT LOSS] ** : 0.821325
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003771 | Grad Max: 0.151879
  -> Layer: shared_layers.0.bias | Grad Mean: 0.400645 | Grad Max: 1.956825
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.005781
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002073 | Grad Max: 0.002073
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002361 | Grad Max: 0.439429
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042937 | Grad Max: 2.398978
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.009973
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020880 | Grad Max: 0.135823
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000344
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003888 | Grad Max: 0.008899
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000979 | Grad Max: 0.003428
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.002000
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015526 | Grad Max: 0.015526
[GRADIENT NORM TOTAL] 8.4697

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.959
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50102895 0.49897102] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 688/1360 | B: 613/1243 | C: 568/1480
[LOSS Ex1] A: 0.63507 | B: 0.61326 | C: 0.61060
[LOGITS Ex2 A] Mean Abs: 2.215 | Max: 7.109
[LOSS Ex2] A: 0.10196 | B: 0.30206 | C: 0.20442
** [JOINT LOSS] ** : 0.822457
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002495 | Grad Max: 0.084145
  -> Layer: shared_layers.0.bias | Grad Mean: 0.267379 | Grad Max: 1.201651
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.005610
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000123 | Grad Max: 0.000123
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001658 | Grad Max: 0.260653
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030078 | Grad Max: 1.447896
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.006663
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015346 | Grad Max: 0.084945
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000303
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002935 | Grad Max: 0.007499
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000673 | Grad Max: 0.002570
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000409 | Grad Max: 0.001732
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009611 | Grad Max: 0.009611
[GRADIENT NORM TOTAL] 5.5780

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.858
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5431274 0.4568726] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 691/1357 | B: 659/1389 | C: 618/1430
[LOSS Ex1] A: 0.63564 | B: 0.61303 | C: 0.60530
[LOGITS Ex2 A] Mean Abs: 2.244 | Max: 6.291
[LOSS Ex2] A: 0.11414 | B: 0.31280 | C: 0.22304
** [JOINT LOSS] ** : 0.834648
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005289 | Grad Max: 0.214770
  -> Layer: shared_layers.0.bias | Grad Mean: 0.488552 | Grad Max: 2.295689
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005465
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007206 | Grad Max: 0.007206
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003130 | Grad Max: 0.654056
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057905 | Grad Max: 3.645379
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.011657
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029062 | Grad Max: 0.165547
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000452
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005711 | Grad Max: 0.012382
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000265
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001379 | Grad Max: 0.004056
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000666 | Grad Max: 0.002162
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019393 | Grad Max: 0.019393
[GRADIENT NORM TOTAL] 11.1332

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.075
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8077895  0.19221045] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.084
[MASKS] A(Pass/Fail): 751/1297 | B: 646/1402 | C: 579/1469
[LOSS Ex1] A: 0.62912 | B: 0.61347 | C: 0.60858
[LOGITS Ex2 A] Mean Abs: 2.281 | Max: 6.782
[LOSS Ex2] A: 0.10444 | B: 0.31587 | C: 0.23947
** [JOINT LOSS] ** : 0.836984
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007527 | Grad Max: 0.251714
  -> Layer: shared_layers.0.bias | Grad Mean: 0.641844 | Grad Max: 3.093090
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.006239
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001708 | Grad Max: 0.001708
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004001 | Grad Max: 0.733402
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074255 | Grad Max: 4.082465
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.015104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039133 | Grad Max: 0.220290
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000574
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007632 | Grad Max: 0.015123
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000368
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001882 | Grad Max: 0.005972
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000933 | Grad Max: 0.002419
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027361 | Grad Max: 0.027361
[GRADIENT NORM TOTAL] 14.4293

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.194
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002435 0.4997565] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 724/1324 | B: 653/1395 | C: 557/1491
[LOSS Ex1] A: 0.63628 | B: 0.60897 | C: 0.61141
[LOGITS Ex2 A] Mean Abs: 2.278 | Max: 6.099
[LOSS Ex2] A: 0.09536 | B: 0.29520 | C: 0.21655
** [JOINT LOSS] ** : 0.821258
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002754 | Grad Max: 0.058498
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176598 | Grad Max: 0.696940
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005120
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000873 | Grad Max: 0.000873
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001191 | Grad Max: 0.252388
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.021617 | Grad Max: 1.394597
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.004476
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008277 | Grad Max: 0.051339
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000198
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001449 | Grad Max: 0.004702
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000355 | Grad Max: 0.001261
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000390 | Grad Max: 0.001132
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004502 | Grad Max: 0.004502
[GRADIENT NORM TOTAL] 4.3870

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.842
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73661226 0.2633877 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.084
[MASKS] A(Pass/Fail): 714/1334 | B: 613/1243 | C: 604/1444
[LOSS Ex1] A: 0.63146 | B: 0.61312 | C: 0.60376
[LOGITS Ex2 A] Mean Abs: 2.211 | Max: 6.612
[LOSS Ex2] A: 0.11359 | B: 0.31837 | C: 0.24034
** [JOINT LOSS] ** : 0.840213
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009207 | Grad Max: 0.272457
  -> Layer: shared_layers.0.bias | Grad Mean: 0.815414 | Grad Max: 3.618618
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005526
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003296 | Grad Max: 0.003296
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005435 | Grad Max: 0.604844
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.101298 | Grad Max: 3.349294
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000618 | Grad Max: 0.020649
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.056483 | Grad Max: 0.305221
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000804
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010939 | Grad Max: 0.020917
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000507
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002638 | Grad Max: 0.008173
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001417 | Grad Max: 0.003328
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039494 | Grad Max: 0.039494
[GRADIENT NORM TOTAL] 17.4737

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.974
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63945895 0.36054105] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.084
[MASKS] A(Pass/Fail): 590/1026 | B: 659/1389 | C: 586/1462
[LOSS Ex1] A: 0.62960 | B: 0.61290 | C: 0.60443
[LOGITS Ex2 A] Mean Abs: 2.245 | Max: 8.402
[LOSS Ex2] A: 0.10210 | B: 0.34827 | C: 0.20398
** [JOINT LOSS] ** : 0.833762
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011678 | Grad Max: 0.337708
  -> Layer: shared_layers.0.bias | Grad Mean: 0.955025 | Grad Max: 4.566785
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.006261
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006318 | Grad Max: 0.006318
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006428 | Grad Max: 1.104421
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.119485 | Grad Max: 6.121582
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000692 | Grad Max: 0.022184
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063123 | Grad Max: 0.332739
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000838
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012291 | Grad Max: 0.024131
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000614
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002954 | Grad Max: 0.009129
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001546 | Grad Max: 0.003932
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043277 | Grad Max: 0.043277
[GRADIENT NORM TOTAL] 21.8891

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.196
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081215 0.4918785] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 723/1325 | B: 646/1402 | C: 590/1458
[LOSS Ex1] A: 0.63033 | B: 0.61334 | C: 0.60682
[LOGITS Ex2 A] Mean Abs: 2.263 | Max: 7.230
[LOSS Ex2] A: 0.09322 | B: 0.31631 | C: 0.22672
** [JOINT LOSS] ** : 0.828916
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004649 | Grad Max: 0.187035
  -> Layer: shared_layers.0.bias | Grad Mean: 0.415648 | Grad Max: 2.470083
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.005208
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003646 | Grad Max: 0.003646
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002643 | Grad Max: 0.802328
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049127 | Grad Max: 4.443995
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.009740
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023180 | Grad Max: 0.136063
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000365
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004383 | Grad Max: 0.009321
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001053 | Grad Max: 0.002979
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000598 | Grad Max: 0.002050
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016386 | Grad Max: 0.016386
[GRADIENT NORM TOTAL] 10.4621

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.127
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066513  0.49334872] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 720/1328 | B: 654/1394 | C: 606/1442
[LOSS Ex1] A: 0.62700 | B: 0.60884 | C: 0.60829
[LOGITS Ex2 A] Mean Abs: 2.310 | Max: 6.360
[LOSS Ex2] A: 0.10348 | B: 0.30700 | C: 0.21884
** [JOINT LOSS] ** : 0.824483
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007352 | Grad Max: 0.236437
  -> Layer: shared_layers.0.bias | Grad Mean: 0.638477 | Grad Max: 2.993062
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.005853
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003998 | Grad Max: 0.003998
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004274 | Grad Max: 0.512546
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.079345 | Grad Max: 2.855659
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000498 | Grad Max: 0.016363
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045366 | Grad Max: 0.231990
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000707
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008836 | Grad Max: 0.018571
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000411
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002138 | Grad Max: 0.006699
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001069 | Grad Max: 0.002918
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030740 | Grad Max: 0.030740
[GRADIENT NORM TOTAL] 13.7294

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.161
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.509246   0.49075398] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 721/1327 | B: 613/1243 | C: 586/1462
[LOSS Ex1] A: 0.62596 | B: 0.61299 | C: 0.60559
[LOGITS Ex2 A] Mean Abs: 2.317 | Max: 7.271
[LOSS Ex2] A: 0.12116 | B: 0.31034 | C: 0.21593
** [JOINT LOSS] ** : 0.830661
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012962 | Grad Max: 0.312729
  -> Layer: shared_layers.0.bias | Grad Mean: 0.815796 | Grad Max: 3.508162
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.006880
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001745 | Grad Max: 0.001745
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005482 | Grad Max: 0.678071
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.100227 | Grad Max: 3.719947
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000638 | Grad Max: 0.020359
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057517 | Grad Max: 0.293532
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000917
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011532 | Grad Max: 0.022435
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000525
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002863 | Grad Max: 0.008365
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001492 | Grad Max: 0.003633
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042715 | Grad Max: 0.042715
[GRADIENT NORM TOTAL] 17.2773

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.963
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50102365 0.49897632] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 688/1360 | B: 659/1389 | C: 591/1457
[LOSS Ex1] A: 0.63489 | B: 0.61278 | C: 0.60411
[LOGITS Ex2 A] Mean Abs: 2.242 | Max: 5.597
[LOSS Ex2] A: 0.09617 | B: 0.31125 | C: 0.21797
** [JOINT LOSS] ** : 0.825727
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004017 | Grad Max: 0.090805
  -> Layer: shared_layers.0.bias | Grad Mean: 0.220542 | Grad Max: 1.057100
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.006229
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011158 | Grad Max: 0.011158
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001578 | Grad Max: 0.427672
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028762 | Grad Max: 2.365088
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.006805
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014972 | Grad Max: 0.082522
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000347
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002970 | Grad Max: 0.007981
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000163
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000680 | Grad Max: 0.002098
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000393 | Grad Max: 0.001534
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008739 | Grad Max: 0.008739
[GRADIENT NORM TOTAL] 5.4385

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.861
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54309046 0.4569095 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 691/1357 | B: 646/1402 | C: 575/1473
[LOSS Ex1] A: 0.63547 | B: 0.61322 | C: 0.61056
[LOGITS Ex2 A] Mean Abs: 2.160 | Max: 6.502
[LOSS Ex2] A: 0.10693 | B: 0.33010 | C: 0.22476
** [JOINT LOSS] ** : 0.840349
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011054 | Grad Max: 0.293140
  -> Layer: shared_layers.0.bias | Grad Mean: 0.897630 | Grad Max: 3.743378
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005659
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010013 | Grad Max: 0.010013
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005526 | Grad Max: 0.693237
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.103252 | Grad Max: 3.722496
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000630 | Grad Max: 0.020459
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057446 | Grad Max: 0.294239
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000833
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011192 | Grad Max: 0.021596
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000533
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002732 | Grad Max: 0.008231
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001473 | Grad Max: 0.003763
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040755 | Grad Max: 0.040755
[GRADIENT NORM TOTAL] 18.4589

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.078
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8086036  0.19139645] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 654/1394 | C: 597/1451
[LOSS Ex1] A: 0.62895 | B: 0.60873 | C: 0.60985
[LOGITS Ex2 A] Mean Abs: 2.185 | Max: 7.421
[LOSS Ex2] A: 0.10983 | B: 0.32192 | C: 0.20407
** [JOINT LOSS] ** : 0.827784
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011175 | Grad Max: 0.316782
  -> Layer: shared_layers.0.bias | Grad Mean: 0.996867 | Grad Max: 4.236221
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006094
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004419 | Grad Max: 0.004419
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006133 | Grad Max: 0.734921
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.115342 | Grad Max: 4.077534
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000720 | Grad Max: 0.024475
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065948 | Grad Max: 0.352022
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.000939
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012628 | Grad Max: 0.025520
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000586
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003011 | Grad Max: 0.009182
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001521 | Grad Max: 0.003923
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043046 | Grad Max: 0.043046
[GRADIENT NORM TOTAL] 20.6800

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.198
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002747 0.4997253] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 724/1324 | B: 613/1243 | C: 392/984
[LOSS Ex1] A: 0.63612 | B: 0.61288 | C: 0.61116
[LOGITS Ex2 A] Mean Abs: 2.224 | Max: 6.732
[LOSS Ex2] A: 0.09169 | B: 0.31189 | C: 0.23384
** [JOINT LOSS] ** : 0.832527
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009426 | Grad Max: 0.253671
  -> Layer: shared_layers.0.bias | Grad Mean: 0.612560 | Grad Max: 2.526103
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.005769
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002872 | Grad Max: 0.002872
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003839 | Grad Max: 0.539015
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070943 | Grad Max: 2.923738
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000455 | Grad Max: 0.015638
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.041105 | Grad Max: 0.216833
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000651
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008114 | Grad Max: 0.016197
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000362
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001973 | Grad Max: 0.005768
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001004 | Grad Max: 0.002587
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028925 | Grad Max: 0.028925
[GRADIENT NORM TOTAL] 12.4163

[EPOCH SUMMARY] Train Loss: 0.8301

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8105 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8114 -> New: 0.8105)

############################## EPOCH 159/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.844
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7371529 0.2628472] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 714/1334 | B: 659/1389 | C: 603/1445
[LOSS Ex1] A: 0.63130 | B: 0.61268 | C: 0.60653
[LOGITS Ex2 A] Mean Abs: 2.268 | Max: 6.115
[LOSS Ex2] A: 0.11455 | B: 0.31769 | C: 0.23269
** [JOINT LOSS] ** : 0.838482
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003503 | Grad Max: 0.230789
  -> Layer: shared_layers.0.bias | Grad Mean: 0.506110 | Grad Max: 2.973874
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.005449
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007077 | Grad Max: 0.007077
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003203 | Grad Max: 0.761161
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059187 | Grad Max: 4.240593
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.011886
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028544 | Grad Max: 0.157957
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000417
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005456 | Grad Max: 0.011637
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000274
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001308 | Grad Max: 0.004116
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000589 | Grad Max: 0.001883
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018080 | Grad Max: 0.018080
[GRADIENT NORM TOTAL] 12.2036

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.978
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6396761 0.3603239] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 590/1026 | B: 646/1402 | C: 582/1466
[LOSS Ex1] A: 0.62944 | B: 0.61312 | C: 0.60527
[LOGITS Ex2 A] Mean Abs: 2.332 | Max: 8.350
[LOSS Ex2] A: 0.10723 | B: 0.33608 | C: 0.19948
** [JOINT LOSS] ** : 0.830208
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006919 | Grad Max: 0.396834
  -> Layer: shared_layers.0.bias | Grad Mean: 0.927320 | Grad Max: 5.301015
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.005724
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008960 | Grad Max: 0.008960
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005959 | Grad Max: 1.157272
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.111614 | Grad Max: 6.443532
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000609 | Grad Max: 0.021855
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.056698 | Grad Max: 0.303807
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000838
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010893 | Grad Max: 0.021766
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000496
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002638 | Grad Max: 0.007871
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001229 | Grad Max: 0.003008
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036963 | Grad Max: 0.036963
[GRADIENT NORM TOTAL] 21.8708

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.199
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081272 0.4918728] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 723/1325 | B: 654/1394 | C: 585/1463
[LOSS Ex1] A: 0.63018 | B: 0.60864 | C: 0.60773
[LOGITS Ex2 A] Mean Abs: 2.299 | Max: 9.558
[LOSS Ex2] A: 0.09581 | B: 0.30743 | C: 0.24821
** [JOINT LOSS] ** : 0.832664
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007641 | Grad Max: 0.338162
  -> Layer: shared_layers.0.bias | Grad Mean: 0.834787 | Grad Max: 4.455676
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.005305
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003098 | Grad Max: 0.003098
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005438 | Grad Max: 0.899949
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.100932 | Grad Max: 5.022822
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000568 | Grad Max: 0.020620
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.052738 | Grad Max: 0.293145
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000712
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010319 | Grad Max: 0.020872
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000469
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002508 | Grad Max: 0.007524
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001233 | Grad Max: 0.003069
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035961 | Grad Max: 0.035961
[GRADIENT NORM TOTAL] 18.9772

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.131
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50656617 0.4934338 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 720/1328 | B: 613/1243 | C: 602/1446
[LOSS Ex1] A: 0.62686 | B: 0.61279 | C: 0.60848
[LOGITS Ex2 A] Mean Abs: 2.256 | Max: 6.641
[LOSS Ex2] A: 0.10221 | B: 0.29149 | C: 0.22059
** [JOINT LOSS] ** : 0.820810
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.061074
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132298 | Grad Max: 0.762006
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.006564
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007361 | Grad Max: 0.007361
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000907 | Grad Max: 0.323117
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016169 | Grad Max: 1.791312
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003371
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003509 | Grad Max: 0.037034
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000150
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000720 | Grad Max: 0.003465
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000077
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000208 | Grad Max: 0.000894
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000381 | Grad Max: 0.001065
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003108 | Grad Max: 0.003108
[GRADIENT NORM TOTAL] 4.0573

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.165
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50930864 0.49069136] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 721/1327 | B: 659/1389 | C: 627/1421
[LOSS Ex1] A: 0.62582 | B: 0.61259 | C: 0.60331
[LOGITS Ex2 A] Mean Abs: 2.181 | Max: 7.189
[LOSS Ex2] A: 0.10594 | B: 0.34546 | C: 0.22493
** [JOINT LOSS] ** : 0.839355
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006781 | Grad Max: 0.300614
  -> Layer: shared_layers.0.bias | Grad Mean: 0.799004 | Grad Max: 3.994755
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.006410
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001450 | Grad Max: 0.001450
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005118 | Grad Max: 0.862552
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095954 | Grad Max: 4.778863
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000554 | Grad Max: 0.018900
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.051502 | Grad Max: 0.275887
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000742
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009892 | Grad Max: 0.020329
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000424
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002432 | Grad Max: 0.006935
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.002935
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036361 | Grad Max: 0.036361
[GRADIENT NORM TOTAL] 17.8803

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.965
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009972  0.49900278] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 687/1361 | B: 646/1402 | C: 595/1453
[LOSS Ex1] A: 0.63476 | B: 0.61303 | C: 0.61235
[LOGITS Ex2 A] Mean Abs: 2.156 | Max: 5.709
[LOSS Ex2] A: 0.10609 | B: 0.34079 | C: 0.23944
** [JOINT LOSS] ** : 0.848825
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008473 | Grad Max: 0.309253
  -> Layer: shared_layers.0.bias | Grad Mean: 0.894625 | Grad Max: 4.017837
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.005430
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000062 | Grad Max: 0.000062
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005674 | Grad Max: 1.003153
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.106524 | Grad Max: 5.565329
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000637 | Grad Max: 0.022264
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.058999 | Grad Max: 0.320150
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000796
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011386 | Grad Max: 0.022087
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000541
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002810 | Grad Max: 0.008773
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001522 | Grad Max: 0.003485
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042909 | Grad Max: 0.042909
[GRADIENT NORM TOTAL] 19.5582

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.863
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5430183  0.45698175] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 691/1357 | B: 654/1394 | C: 575/1473
[LOSS Ex1] A: 0.63535 | B: 0.60855 | C: 0.61078
[LOGITS Ex2 A] Mean Abs: 2.165 | Max: 6.020
[LOSS Ex2] A: 0.10657 | B: 0.30608 | C: 0.21916
** [JOINT LOSS] ** : 0.828827
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003550 | Grad Max: 0.179281
  -> Layer: shared_layers.0.bias | Grad Mean: 0.423705 | Grad Max: 2.449785
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005676
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004495 | Grad Max: 0.004495
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002567 | Grad Max: 0.751546
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047745 | Grad Max: 4.159624
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000266 | Grad Max: 0.008659
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024598 | Grad Max: 0.143189
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000364
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004735 | Grad Max: 0.010315
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000255
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001157 | Grad Max: 0.003710
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000593 | Grad Max: 0.002053
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017030 | Grad Max: 0.017030
[GRADIENT NORM TOTAL] 10.4199

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.081
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8091866  0.19081348] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 614/1242 | C: 595/1453
[LOSS Ex1] A: 0.62883 | B: 0.61269 | C: 0.60959
[LOGITS Ex2 A] Mean Abs: 2.265 | Max: 6.028
[LOSS Ex2] A: 0.10398 | B: 0.29693 | C: 0.23802
** [JOINT LOSS] ** : 0.830014
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007396 | Grad Max: 0.174464
  -> Layer: shared_layers.0.bias | Grad Mean: 0.526854 | Grad Max: 2.245493
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005535
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003248 | Grad Max: 0.003248
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003779 | Grad Max: 0.541816
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069401 | Grad Max: 3.028430
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000424 | Grad Max: 0.013114
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.038754 | Grad Max: 0.197482
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000582
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007735 | Grad Max: 0.015133
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000380
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001896 | Grad Max: 0.005591
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000928 | Grad Max: 0.002465
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026609 | Grad Max: 0.026609
[GRADIENT NORM TOTAL] 11.6302

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.201
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500279   0.49972105] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 723/1325 | B: 659/1389 | C: 597/1451
[LOSS Ex1] A: 0.63601 | B: 0.61250 | C: 0.60227
[LOGITS Ex2 A] Mean Abs: 2.286 | Max: 6.708
[LOSS Ex2] A: 0.09565 | B: 0.31617 | C: 0.22020
** [JOINT LOSS] ** : 0.827595
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007869 | Grad Max: 0.225675
  -> Layer: shared_layers.0.bias | Grad Mean: 0.705917 | Grad Max: 3.113297
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.005317
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004493 | Grad Max: 0.004493
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004707 | Grad Max: 0.583837
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.087936 | Grad Max: 3.267220
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000528 | Grad Max: 0.016457
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048694 | Grad Max: 0.258553
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000739
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009523 | Grad Max: 0.019559
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000458
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002326 | Grad Max: 0.007121
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001177 | Grad Max: 0.002958
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033818 | Grad Max: 0.033818
[GRADIENT NORM TOTAL] 15.5886

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.847
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73758537 0.2624146 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 714/1334 | B: 646/1402 | C: 580/1468
[LOSS Ex1] A: 0.63119 | B: 0.61294 | C: 0.60991
[LOGITS Ex2 A] Mean Abs: 2.231 | Max: 7.140
[LOSS Ex2] A: 0.11624 | B: 0.30848 | C: 0.21346
** [JOINT LOSS] ** : 0.830739
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002700 | Grad Max: 0.069846
  -> Layer: shared_layers.0.bias | Grad Mean: 0.243449 | Grad Max: 1.092979
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005619
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004798 | Grad Max: 0.004798
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.380293
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033533 | Grad Max: 2.115530
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.006632
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013951 | Grad Max: 0.067854
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000220
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002630 | Grad Max: 0.006323
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000136
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000620 | Grad Max: 0.001903
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000396 | Grad Max: 0.001450
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008347 | Grad Max: 0.008347
[GRADIENT NORM TOTAL] 6.4492

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.980
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.63990515 0.36009485] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 590/1026 | B: 654/1394 | C: 613/1435
[LOSS Ex1] A: 0.62933 | B: 0.60845 | C: 0.60240
[LOGITS Ex2 A] Mean Abs: 2.223 | Max: 9.316
[LOSS Ex2] A: 0.11598 | B: 0.30103 | C: 0.19701
** [JOINT LOSS] ** : 0.818068
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008342 | Grad Max: 0.209438
  -> Layer: shared_layers.0.bias | Grad Mean: 0.628049 | Grad Max: 2.808234
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006230
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009408 | Grad Max: 0.009408
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004002 | Grad Max: 0.473613
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074373 | Grad Max: 2.637120
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.015395
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042116 | Grad Max: 0.220122
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000643
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008297 | Grad Max: 0.016603
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000406
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002027 | Grad Max: 0.006210
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001046 | Grad Max: 0.003079
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029628 | Grad Max: 0.029628
[GRADIENT NORM TOTAL] 13.1303

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.201
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081071 0.4918929] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 564/1484
[LOSS Ex1] A: 0.63007 | B: 0.61260 | C: 0.60957
[LOGITS Ex2 A] Mean Abs: 2.207 | Max: 8.820
[LOSS Ex2] A: 0.09776 | B: 0.32490 | C: 0.20816
** [JOINT LOSS] ** : 0.827684
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009911 | Grad Max: 0.250778
  -> Layer: shared_layers.0.bias | Grad Mean: 0.759239 | Grad Max: 3.540335
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.005569
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002111 | Grad Max: 0.002112
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005097 | Grad Max: 0.582361
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.094263 | Grad Max: 3.214533
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.017370
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053080 | Grad Max: 0.255801
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000809
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010545 | Grad Max: 0.021792
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000499
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002569 | Grad Max: 0.008079
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001323 | Grad Max: 0.003563
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037080 | Grad Max: 0.037080
[GRADIENT NORM TOTAL] 16.0767

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.133
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50654465 0.4934554 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 720/1328 | B: 659/1389 | C: 584/1464
[LOSS Ex1] A: 0.62675 | B: 0.61241 | C: 0.60694
[LOGITS Ex2 A] Mean Abs: 2.254 | Max: 6.384
[LOSS Ex2] A: 0.09918 | B: 0.32093 | C: 0.22022
** [JOINT LOSS] ** : 0.828809
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003316 | Grad Max: 0.086155
  -> Layer: shared_layers.0.bias | Grad Mean: 0.235363 | Grad Max: 1.112454
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.006439
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001098 | Grad Max: 0.001098
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001851 | Grad Max: 0.231496
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033415 | Grad Max: 1.252288
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000196 | Grad Max: 0.006739
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018283 | Grad Max: 0.108397
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000293
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003663 | Grad Max: 0.008174
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000927 | Grad Max: 0.002771
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000493 | Grad Max: 0.001837
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014105 | Grad Max: 0.014105
[GRADIENT NORM TOTAL] 5.6994

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.167
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093321  0.49066794] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.084
[MASKS] A(Pass/Fail): 721/1327 | B: 646/1402 | C: 374/1002
[LOSS Ex1] A: 0.62571 | B: 0.61285 | C: 0.60670
[LOGITS Ex2 A] Mean Abs: 2.273 | Max: 6.249
[LOSS Ex2] A: 0.13349 | B: 0.33118 | C: 0.21824
** [JOINT LOSS] ** : 0.842724
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011451 | Grad Max: 0.393806
  -> Layer: shared_layers.0.bias | Grad Mean: 1.050436 | Grad Max: 5.167544
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005920
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004422 | Grad Max: 0.004422
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006586 | Grad Max: 1.293918
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.122334 | Grad Max: 7.189813
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000708 | Grad Max: 0.023341
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065541 | Grad Max: 0.327661
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001016
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012916 | Grad Max: 0.026528
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000616
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003176 | Grad Max: 0.009411
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001589 | Grad Max: 0.003524
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045557 | Grad Max: 0.045557
[GRADIENT NORM TOTAL] 23.3377

[EPOCH SUMMARY] Train Loss: 0.8318

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8409 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 160/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.967
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500957   0.49904302] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 687/1361 | B: 654/1394 | C: 589/1459
[LOSS Ex1] A: 0.63465 | B: 0.60837 | C: 0.60905
[LOGITS Ex2 A] Mean Abs: 2.259 | Max: 5.619
[LOSS Ex2] A: 0.11481 | B: 0.34056 | C: 0.26131
** [JOINT LOSS] ** : 0.856248
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014766 | Grad Max: 0.528722
  -> Layer: shared_layers.0.bias | Grad Mean: 1.455103 | Grad Max: 7.137743
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005487
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004013 | Grad Max: 0.004013
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.009161 | Grad Max: 1.609906
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.170950 | Grad Max: 8.957437
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001000 | Grad Max: 0.033298
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.092770 | Grad Max: 0.492437
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000131 | Grad Max: 0.001247
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.018102 | Grad Max: 0.035314
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000839
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004431 | Grad Max: 0.013405
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002156 | Grad Max: 0.004902
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063153 | Grad Max: 0.063153
[GRADIENT NORM TOTAL] 32.2260

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.865
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5429642  0.45703575] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.560 | Std: 0.080
[MASKS] A(Pass/Fail): 691/1357 | B: 614/1242 | C: 590/1458
[LOSS Ex1] A: 0.63524 | B: 0.61251 | C: 0.61112
[LOGITS Ex2 A] Mean Abs: 2.246 | Max: 6.637
[LOSS Ex2] A: 0.11994 | B: 0.32622 | C: 0.23867
** [JOINT LOSS] ** : 0.847899
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.013001 | Grad Max: 0.439070
  -> Layer: shared_layers.0.bias | Grad Mean: 1.211040 | Grad Max: 5.760216
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005286
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003647 | Grad Max: 0.003647
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007689 | Grad Max: 1.314479
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.143750 | Grad Max: 7.307137
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000846 | Grad Max: 0.028756
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.078325 | Grad Max: 0.411641
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001035
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.015296 | Grad Max: 0.029526
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000692
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003781 | Grad Max: 0.010817
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001931 | Grad Max: 0.004127
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.055216 | Grad Max: 0.055216
[GRADIENT NORM TOTAL] 26.5644

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.084
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.80968964 0.19031039] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 660/1388 | C: 583/1465
[LOSS Ex1] A: 0.62872 | B: 0.61234 | C: 0.60595
[LOGITS Ex2 A] Mean Abs: 2.239 | Max: 7.752
[LOSS Ex2] A: 0.10471 | B: 0.32549 | C: 0.21150
** [JOINT LOSS] ** : 0.829570
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006461 | Grad Max: 0.181002
  -> Layer: shared_layers.0.bias | Grad Mean: 0.396987 | Grad Max: 2.303215
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005805
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001934 | Grad Max: 0.001934
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002903 | Grad Max: 0.533817
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053356 | Grad Max: 2.965971
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000325 | Grad Max: 0.009574
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029477 | Grad Max: 0.136624
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000462
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005888 | Grad Max: 0.011923
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000271
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001469 | Grad Max: 0.004351
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000741 | Grad Max: 0.002188
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021193 | Grad Max: 0.021193
[GRADIENT NORM TOTAL] 9.2871

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.204
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002959  0.49970412] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.083
[MASKS] A(Pass/Fail): 723/1325 | B: 646/1402 | C: 583/1465
[LOSS Ex1] A: 0.63590 | B: 0.61278 | C: 0.60948
[LOGITS Ex2 A] Mean Abs: 2.172 | Max: 6.958
[LOSS Ex2] A: 0.09342 | B: 0.34892 | C: 0.24674
** [JOINT LOSS] ** : 0.849078
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009332 | Grad Max: 0.309701
  -> Layer: shared_layers.0.bias | Grad Mean: 0.882837 | Grad Max: 4.097857
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.005179
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004135 | Grad Max: 0.004135
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005496 | Grad Max: 0.739414
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102408 | Grad Max: 4.097509
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000626 | Grad Max: 0.020551
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.058563 | Grad Max: 0.301898
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000824
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011509 | Grad Max: 0.022830
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000545
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002852 | Grad Max: 0.008426
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001487 | Grad Max: 0.003620
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043026 | Grad Max: 0.043026
[GRADIENT NORM TOTAL] 18.7568

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.849
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.737969 0.262031] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 714/1334 | B: 654/1394 | C: 605/1443
[LOSS Ex1] A: 0.63107 | B: 0.60829 | C: 0.60907
[LOGITS Ex2 A] Mean Abs: 2.107 | Max: 6.597
[LOSS Ex2] A: 0.13215 | B: 0.36784 | C: 0.22182
** [JOINT LOSS] ** : 0.856742
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015805 | Grad Max: 0.440227
  -> Layer: shared_layers.0.bias | Grad Mean: 1.316223 | Grad Max: 5.757745
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005861
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003058 | Grad Max: 0.003058
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008396 | Grad Max: 1.026041
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.156064 | Grad Max: 5.796410
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000976 | Grad Max: 0.031182
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.090391 | Grad Max: 0.469743
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001280
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017801 | Grad Max: 0.035178
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000825
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004410 | Grad Max: 0.013092
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002223 | Grad Max: 0.004992
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064289 | Grad Max: 0.064289
[GRADIENT NORM TOTAL] 27.6736

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.983
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6400871 0.3599129] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 590/1026 | B: 614/1242 | C: 592/1456
[LOSS Ex1] A: 0.62921 | B: 0.61244 | C: 0.60802
[LOGITS Ex2 A] Mean Abs: 2.168 | Max: 9.929
[LOSS Ex2] A: 0.11207 | B: 0.35828 | C: 0.21036
** [JOINT LOSS] ** : 0.843459
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012903 | Grad Max: 0.388497
  -> Layer: shared_layers.0.bias | Grad Mean: 1.140571 | Grad Max: 5.111253
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.006220
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000286 | Grad Max: 0.000286
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007509 | Grad Max: 0.987360
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.140045 | Grad Max: 5.508556
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000855 | Grad Max: 0.027912
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.079450 | Grad Max: 0.416290
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001118
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.015560 | Grad Max: 0.031546
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000703
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003863 | Grad Max: 0.011342
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001891 | Grad Max: 0.004119
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.055518 | Grad Max: 0.055518
[GRADIENT NORM TOTAL] 24.9343

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.205
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50813186 0.49186814] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 723/1325 | B: 660/1388 | C: 580/1468
[LOSS Ex1] A: 0.62995 | B: 0.61227 | C: 0.60662
[LOGITS Ex2 A] Mean Abs: 2.185 | Max: 9.265
[LOSS Ex2] A: 0.09490 | B: 0.32593 | C: 0.21840
** [JOINT LOSS] ** : 0.829357
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007070 | Grad Max: 0.235820
  -> Layer: shared_layers.0.bias | Grad Mean: 0.624730 | Grad Max: 3.197259
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.005454
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000759 | Grad Max: 0.000759
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003969 | Grad Max: 0.727640
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.072888 | Grad Max: 4.093098
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.013317
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039477 | Grad Max: 0.194243
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000583
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007901 | Grad Max: 0.015931
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000438
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001961 | Grad Max: 0.006480
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000970 | Grad Max: 0.002995
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028222 | Grad Max: 0.028222
[GRADIENT NORM TOTAL] 13.9348

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.136
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064795  0.49352053] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 720/1328 | B: 646/1402 | C: 592/1456
[LOSS Ex1] A: 0.62663 | B: 0.61271 | C: 0.60729
[LOGITS Ex2 A] Mean Abs: 2.234 | Max: 5.805
[LOSS Ex2] A: 0.11035 | B: 0.31355 | C: 0.23735
** [JOINT LOSS] ** : 0.835960
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008281 | Grad Max: 0.271023
  -> Layer: shared_layers.0.bias | Grad Mean: 0.799141 | Grad Max: 3.543289
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005811
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002548 | Grad Max: 0.002548
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005197 | Grad Max: 0.795315
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.097112 | Grad Max: 4.401843
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000589 | Grad Max: 0.018592
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.054990 | Grad Max: 0.276612
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000817
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010742 | Grad Max: 0.022312
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000467
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002671 | Grad Max: 0.007599
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001274 | Grad Max: 0.003312
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037991 | Grad Max: 0.037991
[GRADIENT NORM TOTAL] 17.3426

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.170
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50940764 0.4905924 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 721/1327 | B: 654/1394 | C: 578/1470
[LOSS Ex1] A: 0.62560 | B: 0.60822 | C: 0.60714
[LOGITS Ex2 A] Mean Abs: 2.255 | Max: 8.633
[LOSS Ex2] A: 0.13043 | B: 0.34194 | C: 0.24391
** [JOINT LOSS] ** : 0.852412
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014919 | Grad Max: 0.439405
  -> Layer: shared_layers.0.bias | Grad Mean: 1.304749 | Grad Max: 5.844718
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.006568
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001194 | Grad Max: 0.001194
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008635 | Grad Max: 1.220349
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.160856 | Grad Max: 6.852180
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000968 | Grad Max: 0.031964
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.089790 | Grad Max: 0.483073
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000128 | Grad Max: 0.001206
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017775 | Grad Max: 0.034101
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000811
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004430 | Grad Max: 0.013120
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002164 | Grad Max: 0.004465
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063131 | Grad Max: 0.063131
[GRADIENT NORM TOTAL] 28.5682

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.970
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50096303 0.499037  ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 687/1361 | B: 614/1242 | C: 588/1460
[LOSS Ex1] A: 0.63454 | B: 0.61237 | C: 0.60969
[LOGITS Ex2 A] Mean Abs: 2.231 | Max: 5.392
[LOSS Ex2] A: 0.10552 | B: 0.31439 | C: 0.23968
** [JOINT LOSS] ** : 0.838733
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009013 | Grad Max: 0.374025
  -> Layer: shared_layers.0.bias | Grad Mean: 0.997467 | Grad Max: 4.787257
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.005419
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005730 | Grad Max: 0.005730
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006346 | Grad Max: 0.856108
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.118237 | Grad Max: 4.718607
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000685 | Grad Max: 0.023632
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.064520 | Grad Max: 0.343666
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000839
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012734 | Grad Max: 0.024544
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000606
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003253 | Grad Max: 0.009651
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001553 | Grad Max: 0.003758
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047211 | Grad Max: 0.047211
[GRADIENT NORM TOTAL] 21.5971

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.867
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5429509  0.45704907] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.080
[MASKS] A(Pass/Fail): 691/1357 | B: 660/1388 | C: 646/1402
[LOSS Ex1] A: 0.63515 | B: 0.61220 | C: 0.59839
[LOGITS Ex2 A] Mean Abs: 2.175 | Max: 5.952
[LOSS Ex2] A: 0.11468 | B: 0.31766 | C: 0.21599
** [JOINT LOSS] ** : 0.831356
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002764 | Grad Max: 0.127330
  -> Layer: shared_layers.0.bias | Grad Mean: 0.279675 | Grad Max: 1.651603
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.006020
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010517 | Grad Max: 0.010517
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001784 | Grad Max: 0.248521
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032997 | Grad Max: 1.385486
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.007199
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018817 | Grad Max: 0.115625
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000297
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003639 | Grad Max: 0.008316
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000900 | Grad Max: 0.002734
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000433 | Grad Max: 0.001558
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012725 | Grad Max: 0.012725
[GRADIENT NORM TOTAL] 5.9725

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81014526 0.18985473] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 646/1402 | C: 573/1475
[LOSS Ex1] A: 0.62862 | B: 0.61265 | C: 0.60716
[LOGITS Ex2 A] Mean Abs: 2.126 | Max: 7.430
[LOSS Ex2] A: 0.10380 | B: 0.35090 | C: 0.23709
** [JOINT LOSS] ** : 0.846737
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008511 | Grad Max: 0.334128
  -> Layer: shared_layers.0.bias | Grad Mean: 0.929512 | Grad Max: 4.536291
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.005503
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000658 | Grad Max: 0.000658
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005888 | Grad Max: 0.986504
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.110723 | Grad Max: 5.463632
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000632 | Grad Max: 0.020752
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.058991 | Grad Max: 0.300397
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000799
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011462 | Grad Max: 0.022680
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000547
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002869 | Grad Max: 0.008912
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001436 | Grad Max: 0.003300
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042100 | Grad Max: 0.042100
[GRADIENT NORM TOTAL] 20.6991

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.206
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003158  0.49968415] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 655/1393 | C: 615/1433
[LOSS Ex1] A: 0.63580 | B: 0.60816 | C: 0.60573
[LOGITS Ex2 A] Mean Abs: 2.106 | Max: 5.678
[LOSS Ex2] A: 0.10618 | B: 0.37697 | C: 0.24629
** [JOINT LOSS] ** : 0.859710
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.013417 | Grad Max: 0.471862
  -> Layer: shared_layers.0.bias | Grad Mean: 1.367699 | Grad Max: 6.252672
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005121
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000523 | Grad Max: 0.000523
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008540 | Grad Max: 1.280424
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.159934 | Grad Max: 7.092551
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000961 | Grad Max: 0.032648
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.089994 | Grad Max: 0.468748
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000126 | Grad Max: 0.001174
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017661 | Grad Max: 0.033426
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000830
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004421 | Grad Max: 0.012819
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002184 | Grad Max: 0.004940
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064705 | Grad Max: 0.064705
[GRADIENT NORM TOTAL] 29.3540

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.850
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7383021  0.26169786] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 714/1334 | B: 614/1242 | C: 418/958
[LOSS Ex1] A: 0.63097 | B: 0.61231 | C: 0.60197
[LOGITS Ex2 A] Mean Abs: 2.062 | Max: 6.724
[LOSS Ex2] A: 0.12518 | B: 0.37553 | C: 0.20809
** [JOINT LOSS] ** : 0.851350
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014967 | Grad Max: 0.432222
  -> Layer: shared_layers.0.bias | Grad Mean: 1.340253 | Grad Max: 5.807350
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005519
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004659 | Grad Max: 0.004659
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008464 | Grad Max: 1.121580
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.158909 | Grad Max: 6.241195
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000975 | Grad Max: 0.031505
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.090971 | Grad Max: 0.468691
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001203
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017844 | Grad Max: 0.034222
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000815
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004473 | Grad Max: 0.013015
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002194 | Grad Max: 0.004686
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064594 | Grad Max: 0.064594
[GRADIENT NORM TOTAL] 28.3901

[EPOCH SUMMARY] Train Loss: 0.8449

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8220 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 161/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.984
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64020723 0.35979277] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 590/1026 | B: 660/1388 | C: 581/1467
[LOSS Ex1] A: 0.62911 | B: 0.61215 | C: 0.60877
[LOGITS Ex2 A] Mean Abs: 2.146 | Max: 8.113
[LOSS Ex2] A: 0.11459 | B: 0.34426 | C: 0.24992
** [JOINT LOSS] ** : 0.852935
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008828 | Grad Max: 0.303019
  -> Layer: shared_layers.0.bias | Grad Mean: 0.855269 | Grad Max: 4.078129
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005974
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006967 | Grad Max: 0.006968
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005205 | Grad Max: 0.942813
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.097326 | Grad Max: 5.238788
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000573 | Grad Max: 0.019187
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053501 | Grad Max: 0.269647
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000809
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010491 | Grad Max: 0.022177
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000433
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002669 | Grad Max: 0.007078
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001371 | Grad Max: 0.003093
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040277 | Grad Max: 0.040277
[GRADIENT NORM TOTAL] 18.6789

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.207
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50815976 0.49184027] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 723/1325 | B: 647/1401 | C: 614/1434
[LOSS Ex1] A: 0.62986 | B: 0.61259 | C: 0.60282
[LOGITS Ex2 A] Mean Abs: 2.208 | Max: 7.621
[LOSS Ex2] A: 0.09722 | B: 0.31477 | C: 0.19234
** [JOINT LOSS] ** : 0.816530
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006425 | Grad Max: 0.176928
  -> Layer: shared_layers.0.bias | Grad Mean: 0.468144 | Grad Max: 2.309814
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005651
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000019 | Grad Max: 0.000019
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003155 | Grad Max: 0.491187
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058031 | Grad Max: 2.751588
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000324 | Grad Max: 0.009412
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029770 | Grad Max: 0.139437
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000480
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006119 | Grad Max: 0.012080
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000319
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001559 | Grad Max: 0.004746
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000752 | Grad Max: 0.002619
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022316 | Grad Max: 0.022316
[GRADIENT NORM TOTAL] 10.6617

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.138
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50639325 0.49360672] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 720/1328 | B: 656/1392 | C: 586/1462
[LOSS Ex1] A: 0.62654 | B: 0.60810 | C: 0.60782
[LOGITS Ex2 A] Mean Abs: 2.217 | Max: 5.414
[LOSS Ex2] A: 0.11318 | B: 0.31705 | C: 0.21124
** [JOINT LOSS] ** : 0.827977
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010309 | Grad Max: 0.356536
  -> Layer: shared_layers.0.bias | Grad Mean: 0.941553 | Grad Max: 4.747566
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002281 | Grad Max: 0.006598
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005705 | Grad Max: 0.005705
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006171 | Grad Max: 0.940999
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.114857 | Grad Max: 5.266811
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000653 | Grad Max: 0.019791
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061291 | Grad Max: 0.299631
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.000816
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012200 | Grad Max: 0.023819
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000546
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003102 | Grad Max: 0.008790
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001522 | Grad Max: 0.003815
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045557 | Grad Max: 0.045557
[GRADIENT NORM TOTAL] 21.1141

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.172
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50947475 0.49052528] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 721/1327 | B: 614/1242 | C: 592/1456
[LOSS Ex1] A: 0.62550 | B: 0.61225 | C: 0.60975
[LOGITS Ex2 A] Mean Abs: 2.178 | Max: 5.411
[LOSS Ex2] A: 0.12588 | B: 0.30728 | C: 0.22997
** [JOINT LOSS] ** : 0.836878
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010469 | Grad Max: 0.291542
  -> Layer: shared_layers.0.bias | Grad Mean: 0.802743 | Grad Max: 3.777721
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006013
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005749 | Grad Max: 0.005749
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005240 | Grad Max: 0.827562
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.096409 | Grad Max: 4.623919
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000567 | Grad Max: 0.016998
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.052398 | Grad Max: 0.266506
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000871
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010680 | Grad Max: 0.022067
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000469
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002735 | Grad Max: 0.007766
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001330 | Grad Max: 0.003371
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039651 | Grad Max: 0.039651
[GRADIENT NORM TOTAL] 17.6313

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.971
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009693  0.49903068] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.084
[MASKS] A(Pass/Fail): 687/1361 | B: 660/1388 | C: 594/1454
[LOSS Ex1] A: 0.63445 | B: 0.61209 | C: 0.60793
[LOGITS Ex2 A] Mean Abs: 2.134 | Max: 7.129
[LOSS Ex2] A: 0.10418 | B: 0.32003 | C: 0.21101
** [JOINT LOSS] ** : 0.829898
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002745 | Grad Max: 0.099065
  -> Layer: shared_layers.0.bias | Grad Mean: 0.139755 | Grad Max: 0.930132
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005322
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005786 | Grad Max: 0.005786
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000966 | Grad Max: 0.201411
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017405 | Grad Max: 1.122477
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.003535
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007786 | Grad Max: 0.052553
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000193
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001603 | Grad Max: 0.004293
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000118
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000438 | Grad Max: 0.001602
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000478 | Grad Max: 0.001383
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006035 | Grad Max: 0.006035
[GRADIENT NORM TOTAL] 3.3941

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.868
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5429203  0.45707968] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.080
[MASKS] A(Pass/Fail): 691/1357 | B: 647/1401 | C: 584/1464
[LOSS Ex1] A: 0.63506 | B: 0.61253 | C: 0.60756
[LOGITS Ex2 A] Mean Abs: 2.024 | Max: 5.942
[LOSS Ex2] A: 0.11537 | B: 0.33888 | C: 0.22309
** [JOINT LOSS] ** : 0.844164
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011673 | Grad Max: 0.308583
  -> Layer: shared_layers.0.bias | Grad Mean: 0.910820 | Grad Max: 4.084969
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005329
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005464 | Grad Max: 0.005464
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006029 | Grad Max: 1.060966
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.112320 | Grad Max: 5.882243
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000667 | Grad Max: 0.021103
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.062110 | Grad Max: 0.322762
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000814
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012388 | Grad Max: 0.023435
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000617
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003110 | Grad Max: 0.009519
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001556 | Grad Max: 0.003902
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045070 | Grad Max: 0.045070
[GRADIENT NORM TOTAL] 20.1214

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.088
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81055164 0.18944834] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 656/1392 | C: 595/1453
[LOSS Ex1] A: 0.62853 | B: 0.60804 | C: 0.60840
[LOGITS Ex2 A] Mean Abs: 2.063 | Max: 6.368
[LOSS Ex2] A: 0.10949 | B: 0.35395 | C: 0.25378
** [JOINT LOSS] ** : 0.854063
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012816 | Grad Max: 0.396530
  -> Layer: shared_layers.0.bias | Grad Mean: 1.175350 | Grad Max: 5.188452
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.005944
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000872 | Grad Max: 0.000872
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007643 | Grad Max: 1.182893
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.142581 | Grad Max: 6.539235
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000846 | Grad Max: 0.027465
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.079376 | Grad Max: 0.412340
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001093
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.015857 | Grad Max: 0.030678
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000767
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004049 | Grad Max: 0.012402
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001992 | Grad Max: 0.004314
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.059023 | Grad Max: 0.059023
[GRADIENT NORM TOTAL] 25.5983

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003362  0.49966374] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 614/1434
[LOSS Ex1] A: 0.63572 | B: 0.61219 | C: 0.60535
[LOGITS Ex2 A] Mean Abs: 2.080 | Max: 5.536
[LOSS Ex2] A: 0.10526 | B: 0.33345 | C: 0.21694
** [JOINT LOSS] ** : 0.836299
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011269 | Grad Max: 0.324552
  -> Layer: shared_layers.0.bias | Grad Mean: 0.931066 | Grad Max: 4.462290
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005596
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000423 | Grad Max: 0.000423
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005997 | Grad Max: 0.969616
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.111456 | Grad Max: 5.369363
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000659 | Grad Max: 0.022469
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061692 | Grad Max: 0.317006
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000846
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012402 | Grad Max: 0.024315
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000670
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003118 | Grad Max: 0.010489
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001528 | Grad Max: 0.003966
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044529 | Grad Max: 0.044529
[GRADIENT NORM TOTAL] 20.2734

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.852
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7386077 0.2613923] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 714/1334 | B: 660/1388 | C: 583/1465
[LOSS Ex1] A: 0.63088 | B: 0.61203 | C: 0.60879
[LOGITS Ex2 A] Mean Abs: 2.123 | Max: 6.623
[LOSS Ex2] A: 0.11592 | B: 0.31812 | C: 0.20940
** [JOINT LOSS] ** : 0.831713
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005207 | Grad Max: 0.151768
  -> Layer: shared_layers.0.bias | Grad Mean: 0.235579 | Grad Max: 1.257832
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005258
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004167 | Grad Max: 0.004167
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001646 | Grad Max: 0.438887
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029822 | Grad Max: 2.465098
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000135 | Grad Max: 0.005953
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011795 | Grad Max: 0.070598
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000292
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002361 | Grad Max: 0.005591
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000169
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000596 | Grad Max: 0.002076
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000422 | Grad Max: 0.001787
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008953 | Grad Max: 0.008953
[GRADIENT NORM TOTAL] 5.8254

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.986
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6403089 0.3596911] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 590/1026 | B: 647/1401 | C: 604/1444
[LOSS Ex1] A: 0.62902 | B: 0.61246 | C: 0.60556
[LOGITS Ex2 A] Mean Abs: 2.235 | Max: 9.290
[LOSS Ex2] A: 0.11120 | B: 0.34056 | C: 0.22825
** [JOINT LOSS] ** : 0.842349
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009533 | Grad Max: 0.358694
  -> Layer: shared_layers.0.bias | Grad Mean: 0.988426 | Grad Max: 4.726862
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006107
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002286 | Grad Max: 0.002286
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006286 | Grad Max: 0.951131
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.117165 | Grad Max: 5.321892
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000673 | Grad Max: 0.021844
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063575 | Grad Max: 0.327912
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000886
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012625 | Grad Max: 0.025510
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000552
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003189 | Grad Max: 0.009560
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001485 | Grad Max: 0.003679
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044985 | Grad Max: 0.044985
[GRADIENT NORM TOTAL] 21.8177

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.210
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081578  0.49184218] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 723/1325 | B: 656/1392 | C: 576/1472
[LOSS Ex1] A: 0.62976 | B: 0.60798 | C: 0.60975
[LOGITS Ex2 A] Mean Abs: 2.240 | Max: 8.202
[LOSS Ex2] A: 0.11755 | B: 0.35946 | C: 0.24298
** [JOINT LOSS] ** : 0.855824
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015140 | Grad Max: 0.540683
  -> Layer: shared_layers.0.bias | Grad Mean: 1.474462 | Grad Max: 7.200821
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.005681
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006735 | Grad Max: 0.006735
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.009487 | Grad Max: 1.556246
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.176526 | Grad Max: 8.655821
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001022 | Grad Max: 0.032375
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.096072 | Grad Max: 0.487057
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000137 | Grad Max: 0.001331
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.019191 | Grad Max: 0.037839
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000881
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004895 | Grad Max: 0.014409
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002379 | Grad Max: 0.004922
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.070518 | Grad Max: 0.070518
[GRADIENT NORM TOTAL] 32.6617

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.140
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063253  0.49367473] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 719/1329 | B: 614/1242 | C: 610/1438
[LOSS Ex1] A: 0.62644 | B: 0.61212 | C: 0.60542
[LOGITS Ex2 A] Mean Abs: 2.230 | Max: 7.251
[LOSS Ex2] A: 0.12320 | B: 0.34058 | C: 0.25586
** [JOINT LOSS] ** : 0.854543
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015828 | Grad Max: 0.480430
  -> Layer: shared_layers.0.bias | Grad Mean: 1.332058 | Grad Max: 6.388570
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.006254
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000640 | Grad Max: 0.000640
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008483 | Grad Max: 1.449577
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.157754 | Grad Max: 8.069330
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000918 | Grad Max: 0.030032
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.086027 | Grad Max: 0.455311
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000124 | Grad Max: 0.001213
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017284 | Grad Max: 0.034699
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000839
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004437 | Grad Max: 0.013258
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002193 | Grad Max: 0.004876
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063962 | Grad Max: 0.063962
[GRADIENT NORM TOTAL] 29.2266

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.175
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5095419  0.49045804] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 722/1326 | B: 660/1388 | C: 612/1436
[LOSS Ex1] A: 0.62541 | B: 0.61197 | C: 0.60031
[LOGITS Ex2 A] Mean Abs: 2.181 | Max: 5.593
[LOSS Ex2] A: 0.12466 | B: 0.32195 | C: 0.22295
** [JOINT LOSS] ** : 0.835747
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008486 | Grad Max: 0.194835
  -> Layer: shared_layers.0.bias | Grad Mean: 0.635235 | Grad Max: 2.800750
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.006457
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008632 | Grad Max: 0.008632
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004172 | Grad Max: 0.551481
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.077279 | Grad Max: 3.059848
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000460 | Grad Max: 0.014970
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042767 | Grad Max: 0.213320
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000688
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008602 | Grad Max: 0.017739
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000410
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002144 | Grad Max: 0.006097
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001021 | Grad Max: 0.002669
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029596 | Grad Max: 0.029596
[GRADIENT NORM TOTAL] 13.5706

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.973
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50098044 0.49901956] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.084
[MASKS] A(Pass/Fail): 687/1361 | B: 647/1401 | C: 405/971
[LOSS Ex1] A: 0.63435 | B: 0.61240 | C: 0.60750
[LOGITS Ex2 A] Mean Abs: 2.091 | Max: 5.206
[LOSS Ex2] A: 0.10427 | B: 0.33059 | C: 0.24025
** [JOINT LOSS] ** : 0.843120
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005556 | Grad Max: 0.212607
  -> Layer: shared_layers.0.bias | Grad Mean: 0.562217 | Grad Max: 2.663414
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005431
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006201 | Grad Max: 0.006201
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003592 | Grad Max: 0.687723
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066616 | Grad Max: 3.858468
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000380 | Grad Max: 0.013010
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036071 | Grad Max: 0.199566
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000562
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007284 | Grad Max: 0.015156
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000390
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001897 | Grad Max: 0.005873
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000975 | Grad Max: 0.002784
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028842 | Grad Max: 0.028842
[GRADIENT NORM TOTAL] 12.4873

[EPOCH SUMMARY] Train Loss: 0.8401

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8339 | Alpha: 0.5500
No improve count: 3/15

############################## EPOCH 162/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.870
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.542831 0.457169] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.080
[MASKS] A(Pass/Fail): 691/1357 | B: 657/1391 | C: 621/1427
[LOSS Ex1] A: 0.63496 | B: 0.60792 | C: 0.60527
[LOGITS Ex2 A] Mean Abs: 2.006 | Max: 5.724
[LOSS Ex2] A: 0.11886 | B: 0.33834 | C: 0.22720
** [JOINT LOSS] ** : 0.844180
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011118 | Grad Max: 0.334421
  -> Layer: shared_layers.0.bias | Grad Mean: 1.021909 | Grad Max: 4.495038
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005670
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008070 | Grad Max: 0.008070
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006467 | Grad Max: 0.859121
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.120986 | Grad Max: 4.828690
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000732 | Grad Max: 0.024013
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.068903 | Grad Max: 0.361888
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.000981
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013702 | Grad Max: 0.027774
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000633
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003512 | Grad Max: 0.010810
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001731 | Grad Max: 0.003982
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050764 | Grad Max: 0.050764
[GRADIENT NORM TOTAL] 21.7739

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.091
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81092757 0.1890724 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 614/1242 | C: 588/1460
[LOSS Ex1] A: 0.62843 | B: 0.61207 | C: 0.60357
[LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.226
[LOSS Ex2] A: 0.10353 | B: 0.33489 | C: 0.22732
** [JOINT LOSS] ** : 0.836607
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007212 | Grad Max: 0.311165
  -> Layer: shared_layers.0.bias | Grad Mean: 0.899818 | Grad Max: 4.124870
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.005807
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004439 | Grad Max: 0.004439
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005582 | Grad Max: 0.788126
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.104532 | Grad Max: 4.409972
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000627 | Grad Max: 0.020331
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059922 | Grad Max: 0.311315
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000801
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011872 | Grad Max: 0.023300
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000563
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003030 | Grad Max: 0.009520
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001456 | Grad Max: 0.003558
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043182 | Grad Max: 0.043182
[GRADIENT NORM TOTAL] 19.2567

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.211
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003988 0.4996012] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 660/1388 | C: 595/1453
[LOSS Ex1] A: 0.63562 | B: 0.61191 | C: 0.60738
[LOGITS Ex2 A] Mean Abs: 2.126 | Max: 6.378
[LOSS Ex2] A: 0.09523 | B: 0.31972 | C: 0.21826
** [JOINT LOSS] ** : 0.829373
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003212 | Grad Max: 0.131001
  -> Layer: shared_layers.0.bias | Grad Mean: 0.347285 | Grad Max: 1.617949
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002028 | Grad Max: 0.005381
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004503 | Grad Max: 0.004503
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.297631
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039684 | Grad Max: 1.639256
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000230 | Grad Max: 0.009496
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021822 | Grad Max: 0.131124
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000360
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004428 | Grad Max: 0.009735
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000258
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001143 | Grad Max: 0.003711
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000565 | Grad Max: 0.002340
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016450 | Grad Max: 0.016450
[GRADIENT NORM TOTAL] 7.4713

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.853
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7388318  0.26116812] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 713/1335 | B: 648/1400 | C: 607/1441
[LOSS Ex1] A: 0.63077 | B: 0.61235 | C: 0.60340
[LOGITS Ex2 A] Mean Abs: 2.192 | Max: 6.114
[LOSS Ex2] A: 0.12217 | B: 0.33165 | C: 0.23210
** [JOINT LOSS] ** : 0.844144
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009215 | Grad Max: 0.356028
  -> Layer: shared_layers.0.bias | Grad Mean: 0.922336 | Grad Max: 4.744699
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.006019
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000592 | Grad Max: 0.000592
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005878 | Grad Max: 0.865191
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.109741 | Grad Max: 4.845566
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000622 | Grad Max: 0.021071
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059067 | Grad Max: 0.305509
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000818
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011830 | Grad Max: 0.023053
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000594
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003051 | Grad Max: 0.008826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001493 | Grad Max: 0.003624
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044148 | Grad Max: 0.044148
[GRADIENT NORM TOTAL] 20.5151

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.988
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64034617 0.35965377] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 590/1026 | B: 657/1391 | C: 578/1470
[LOSS Ex1] A: 0.62891 | B: 0.60786 | C: 0.61011
[LOGITS Ex2 A] Mean Abs: 2.245 | Max: 9.645
[LOSS Ex2] A: 0.11549 | B: 0.34977 | C: 0.23964
** [JOINT LOSS] ** : 0.850594
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012852 | Grad Max: 0.532753
  -> Layer: shared_layers.0.bias | Grad Mean: 1.339190 | Grad Max: 7.069860
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.006310
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011326 | Grad Max: 0.011326
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008578 | Grad Max: 1.427239
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.160076 | Grad Max: 7.932341
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000901 | Grad Max: 0.031890
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.085697 | Grad Max: 0.444281
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000120 | Grad Max: 0.001127
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017171 | Grad Max: 0.032289
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000815
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004422 | Grad Max: 0.013307
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002112 | Grad Max: 0.004443
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063003 | Grad Max: 0.063003
[GRADIENT NORM TOTAL] 29.9756

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.212
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50821733 0.4917827 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 614/1434
[LOSS Ex1] A: 0.62966 | B: 0.61201 | C: 0.60687
[LOGITS Ex2 A] Mean Abs: 2.186 | Max: 7.123
[LOSS Ex2] A: 0.11596 | B: 0.34282 | C: 0.26388
** [JOINT LOSS] ** : 0.857064
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.013117 | Grad Max: 0.489573
  -> Layer: shared_layers.0.bias | Grad Mean: 1.259133 | Grad Max: 6.520995
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005363
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000896 | Grad Max: 0.000896
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008067 | Grad Max: 1.342071
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.150090 | Grad Max: 7.454862
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000850 | Grad Max: 0.030287
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.080491 | Grad Max: 0.435469
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000115 | Grad Max: 0.001143
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.016223 | Grad Max: 0.032338
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000779
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004197 | Grad Max: 0.012363
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002049 | Grad Max: 0.004386
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.060341 | Grad Max: 0.060341
[GRADIENT NORM TOTAL] 27.8994

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.142
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062383  0.49376172] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 719/1329 | B: 660/1388 | C: 562/1486
[LOSS Ex1] A: 0.62634 | B: 0.61186 | C: 0.60751
[LOGITS Ex2 A] Mean Abs: 2.176 | Max: 5.464
[LOSS Ex2] A: 0.11142 | B: 0.32671 | C: 0.22425
** [JOINT LOSS] ** : 0.836029
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007207 | Grad Max: 0.246022
  -> Layer: shared_layers.0.bias | Grad Mean: 0.682596 | Grad Max: 3.156439
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.006394
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002524 | Grad Max: 0.002524
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004121 | Grad Max: 0.653742
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.077226 | Grad Max: 3.650321
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.016062
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.043596 | Grad Max: 0.240499
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000630
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008715 | Grad Max: 0.017427
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000437
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002274 | Grad Max: 0.006816
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001086 | Grad Max: 0.002959
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032871 | Grad Max: 0.032871
[GRADIENT NORM TOTAL] 14.6344

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.177
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50963676 0.49036324] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 722/1326 | B: 648/1400 | C: 611/1437
[LOSS Ex1] A: 0.62530 | B: 0.61230 | C: 0.61047
[LOGITS Ex2 A] Mean Abs: 2.063 | Max: 5.504
[LOSS Ex2] A: 0.11858 | B: 0.32475 | C: 0.22959
** [JOINT LOSS] ** : 0.840331
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004574 | Grad Max: 0.196008
  -> Layer: shared_layers.0.bias | Grad Mean: 0.477681 | Grad Max: 2.605604
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.006470
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002137 | Grad Max: 0.002137
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002926 | Grad Max: 0.709886
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054695 | Grad Max: 3.947270
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.012094
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029425 | Grad Max: 0.171063
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000436
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005718 | Grad Max: 0.011408
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000305
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001475 | Grad Max: 0.004619
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000756 | Grad Max: 0.002420
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022162 | Grad Max: 0.022162
[GRADIENT NORM TOTAL] 11.0133

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.975
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010113  0.49898872] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.084
[MASKS] A(Pass/Fail): 687/1361 | B: 657/1391 | C: 585/1463
[LOSS Ex1] A: 0.63425 | B: 0.60781 | C: 0.60972
[LOGITS Ex2 A] Mean Abs: 2.008 | Max: 5.402
[LOSS Ex2] A: 0.10694 | B: 0.33401 | C: 0.22264
** [JOINT LOSS] ** : 0.838458
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011106 | Grad Max: 0.307201
  -> Layer: shared_layers.0.bias | Grad Mean: 0.935803 | Grad Max: 4.136335
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.006077
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009747 | Grad Max: 0.009747
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006169 | Grad Max: 1.099012
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.115017 | Grad Max: 6.086195
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000675 | Grad Max: 0.022288
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063275 | Grad Max: 0.344222
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.000926
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012735 | Grad Max: 0.024862
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000671
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003269 | Grad Max: 0.010557
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001617 | Grad Max: 0.004285
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046703 | Grad Max: 0.046703
[GRADIENT NORM TOTAL] 20.9890

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.871
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54276896 0.45723101] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 691/1357 | B: 614/1242 | C: 646/1402
[LOSS Ex1] A: 0.63486 | B: 0.61196 | C: 0.60007
[LOGITS Ex2 A] Mean Abs: 2.006 | Max: 5.734
[LOSS Ex2] A: 0.11773 | B: 0.33056 | C: 0.20510
** [JOINT LOSS] ** : 0.833429
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010899 | Grad Max: 0.282616
  -> Layer: shared_layers.0.bias | Grad Mean: 0.859424 | Grad Max: 3.826611
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005656
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004196 | Grad Max: 0.004196
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005676 | Grad Max: 0.903911
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.105407 | Grad Max: 5.017396
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000618 | Grad Max: 0.021146
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057871 | Grad Max: 0.291765
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000843
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011759 | Grad Max: 0.023548
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000607
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003016 | Grad Max: 0.010054
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001483 | Grad Max: 0.003936
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042652 | Grad Max: 0.042652
[GRADIENT NORM TOTAL] 19.0446

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.093
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81127405 0.18872589] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 660/1388 | C: 579/1469
[LOSS Ex1] A: 0.62834 | B: 0.61181 | C: 0.60500
[LOGITS Ex2 A] Mean Abs: 2.089 | Max: 7.652
[LOSS Ex2] A: 0.10306 | B: 0.32171 | C: 0.20187
** [JOINT LOSS] ** : 0.823928
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.123583
  -> Layer: shared_layers.0.bias | Grad Mean: 0.230572 | Grad Max: 1.661487
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.005871
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005600 | Grad Max: 0.005600
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001394 | Grad Max: 0.536765
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.025252 | Grad Max: 3.004112
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000105 | Grad Max: 0.006449
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009797 | Grad Max: 0.074446
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000194
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001869 | Grad Max: 0.005392
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000143
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000442 | Grad Max: 0.001878
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000513 | Grad Max: 0.001555
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005402 | Grad Max: 0.005402
[GRADIENT NORM TOTAL] 6.1012

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.214
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500443   0.49955702] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 648/1400 | C: 593/1455
[LOSS Ex1] A: 0.63553 | B: 0.61224 | C: 0.61065
[LOGITS Ex2 A] Mean Abs: 2.141 | Max: 6.220
[LOSS Ex2] A: 0.10496 | B: 0.32453 | C: 0.23303
** [JOINT LOSS] ** : 0.840312
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006456 | Grad Max: 0.288697
  -> Layer: shared_layers.0.bias | Grad Mean: 0.753723 | Grad Max: 3.975960
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.005314
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003989 | Grad Max: 0.003989
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004695 | Grad Max: 0.930153
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088261 | Grad Max: 5.173518
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.016125
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.046968 | Grad Max: 0.257391
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000662
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009289 | Grad Max: 0.019201
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000483
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002391 | Grad Max: 0.007437
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001149 | Grad Max: 0.003088
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033870 | Grad Max: 0.033870
[GRADIENT NORM TOTAL] 17.3096

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.855
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.73910874 0.2608913 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 713/1335 | B: 657/1391 | C: 578/1470
[LOSS Ex1] A: 0.63068 | B: 0.60775 | C: 0.60654
[LOGITS Ex2 A] Mean Abs: 2.143 | Max: 6.146
[LOSS Ex2] A: 0.12715 | B: 0.32942 | C: 0.25085
** [JOINT LOSS] ** : 0.850793
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009078 | Grad Max: 0.432038
  -> Layer: shared_layers.0.bias | Grad Mean: 1.090011 | Grad Max: 5.749931
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.005999
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000855 | Grad Max: 0.000855
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006854 | Grad Max: 1.305509
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.128257 | Grad Max: 7.270782
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000713 | Grad Max: 0.026677
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.068203 | Grad Max: 0.364068
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000943
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013608 | Grad Max: 0.027081
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000676
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003532 | Grad Max: 0.010933
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001684 | Grad Max: 0.003917
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050259 | Grad Max: 0.050259
[GRADIENT NORM TOTAL] 24.7002

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.989
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6405093  0.35949066] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 590/1026 | B: 614/1242 | C: 402/974
[LOSS Ex1] A: 0.62881 | B: 0.61190 | C: 0.60563
[LOGITS Ex2 A] Mean Abs: 2.181 | Max: 8.448
[LOSS Ex2] A: 0.10544 | B: 0.31132 | C: 0.21571
** [JOINT LOSS] ** : 0.826273
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007795 | Grad Max: 0.354998
  -> Layer: shared_layers.0.bias | Grad Mean: 0.816450 | Grad Max: 4.356618
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.005602
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001070 | Grad Max: 0.001070
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005328 | Grad Max: 0.980862
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.099555 | Grad Max: 5.460134
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000554 | Grad Max: 0.018176
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.052701 | Grad Max: 0.271809
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000719
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010635 | Grad Max: 0.021004
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000529
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002846 | Grad Max: 0.008001
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001349 | Grad Max: 0.003791
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041549 | Grad Max: 0.041549
[GRADIENT NORM TOTAL] 18.7100

[EPOCH SUMMARY] Train Loss: 0.8394

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8105 | Alpha: 0.5500
No improve count: 4/15

############################## EPOCH 163/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.214
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50822383 0.4917762 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 723/1325 | B: 660/1388 | C: 588/1460
[LOSS Ex1] A: 0.62957 | B: 0.61175 | C: 0.60804
[LOGITS Ex2 A] Mean Abs: 2.103 | Max: 8.605
[LOSS Ex2] A: 0.10339 | B: 0.31850 | C: 0.21870
** [JOINT LOSS] ** : 0.829980
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003092 | Grad Max: 0.108462
  -> Layer: shared_layers.0.bias | Grad Mean: 0.085631 | Grad Max: 0.545549
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.005461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000071 | Grad Max: 0.000071
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000706 | Grad Max: 0.197091
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011666 | Grad Max: 1.071632
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.002595
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003179 | Grad Max: 0.033154
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000122
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000563 | Grad Max: 0.002605
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000090
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000152 | Grad Max: 0.000829
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000527 | Grad Max: 0.001527
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001079 | Grad Max: 0.001079
[GRADIENT NORM TOTAL] 2.4782

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.145
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062189 0.4937811] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 719/1329 | B: 648/1400 | C: 599/1449
[LOSS Ex1] A: 0.62624 | B: 0.61218 | C: 0.60649
[LOGITS Ex2 A] Mean Abs: 2.070 | Max: 6.261
[LOSS Ex2] A: 0.10856 | B: 0.33567 | C: 0.22991
** [JOINT LOSS] ** : 0.839680
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009117 | Grad Max: 0.278879
  -> Layer: shared_layers.0.bias | Grad Mean: 0.791593 | Grad Max: 3.542408
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006195
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000721 | Grad Max: 0.000721
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005066 | Grad Max: 0.605913
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095113 | Grad Max: 3.371073
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000568 | Grad Max: 0.017940
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053961 | Grad Max: 0.272187
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000815
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010817 | Grad Max: 0.022202
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000559
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002803 | Grad Max: 0.008742
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001392 | Grad Max: 0.003529
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040354 | Grad Max: 0.040354
[GRADIENT NORM TOTAL] 16.8361

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50967306 0.49032697] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 722/1326 | B: 657/1391 | C: 617/1431
[LOSS Ex1] A: 0.62520 | B: 0.60768 | C: 0.60545
[LOGITS Ex2 A] Mean Abs: 2.016 | Max: 6.276
[LOSS Ex2] A: 0.12606 | B: 0.32711 | C: 0.21305
** [JOINT LOSS] ** : 0.834851
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007962 | Grad Max: 0.327408
  -> Layer: shared_layers.0.bias | Grad Mean: 0.898724 | Grad Max: 4.261260
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.006163
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002788 | Grad Max: 0.002788
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005681 | Grad Max: 0.987189
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.106845 | Grad Max: 5.440413
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000638 | Grad Max: 0.022411
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.060833 | Grad Max: 0.335769
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000856
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011997 | Grad Max: 0.024231
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000641
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003111 | Grad Max: 0.010064
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001498 | Grad Max: 0.003917
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044219 | Grad Max: 0.044219
[GRADIENT NORM TOTAL] 19.8597

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.977
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50099266 0.49900737] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.084
[MASKS] A(Pass/Fail): 687/1361 | B: 614/1242 | C: 607/1441
[LOSS Ex1] A: 0.63415 | B: 0.61182 | C: 0.60657
[LOGITS Ex2 A] Mean Abs: 2.039 | Max: 5.242
[LOSS Ex2] A: 0.10581 | B: 0.31334 | C: 0.22078
** [JOINT LOSS] ** : 0.830821
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004589 | Grad Max: 0.210345
  -> Layer: shared_layers.0.bias | Grad Mean: 0.582710 | Grad Max: 2.773826
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.005260
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005593 | Grad Max: 0.005593
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003664 | Grad Max: 0.721975
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.068277 | Grad Max: 4.019463
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.015011
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039840 | Grad Max: 0.228691
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000618
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007916 | Grad Max: 0.016753
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000446
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002039 | Grad Max: 0.006702
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000990 | Grad Max: 0.003090
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028832 | Grad Max: 0.028832
[GRADIENT NORM TOTAL] 13.2518

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.873
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5426817 0.4573183] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 691/1357 | B: 660/1388 | C: 591/1457
[LOSS Ex1] A: 0.63477 | B: 0.61167 | C: 0.60896
[LOGITS Ex2 A] Mean Abs: 2.094 | Max: 6.387
[LOSS Ex2] A: 0.11706 | B: 0.32283 | C: 0.21550
** [JOINT LOSS] ** : 0.836933
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005293 | Grad Max: 0.167962
  -> Layer: shared_layers.0.bias | Grad Mean: 0.332246 | Grad Max: 1.488940
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.005800
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004559 | Grad Max: 0.004559
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.253167
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.038781 | Grad Max: 1.423368
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.007066
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018910 | Grad Max: 0.100349
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000350
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003916 | Grad Max: 0.008856
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000239
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001041 | Grad Max: 0.003452
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000512 | Grad Max: 0.002069
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015049 | Grad Max: 0.015049
[GRADIENT NORM TOTAL] 7.1214

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.095
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81180334 0.18819672] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 648/1400 | C: 590/1458
[LOSS Ex1] A: 0.62823 | B: 0.61209 | C: 0.60693
[LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.982
[LOSS Ex2] A: 0.12305 | B: 0.32984 | C: 0.22648
** [JOINT LOSS] ** : 0.842211
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010983 | Grad Max: 0.297347
  -> Layer: shared_layers.0.bias | Grad Mean: 0.708925 | Grad Max: 3.213648
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005753
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000158 | Grad Max: 0.000158
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004495 | Grad Max: 0.592490
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.082163 | Grad Max: 3.281419
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.013090
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044499 | Grad Max: 0.212258
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000773
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009206 | Grad Max: 0.019725
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000441
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002401 | Grad Max: 0.007335
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001164 | Grad Max: 0.003109
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033295 | Grad Max: 0.033295
[GRADIENT NORM TOTAL] 14.9356

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.216
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004322  0.49956778] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 657/1391 | C: 598/1450
[LOSS Ex1] A: 0.63542 | B: 0.60760 | C: 0.60775
[LOGITS Ex2 A] Mean Abs: 2.164 | Max: 6.488
[LOSS Ex2] A: 0.10305 | B: 0.29598 | C: 0.22764
** [JOINT LOSS] ** : 0.825814
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006213 | Grad Max: 0.146569
  -> Layer: shared_layers.0.bias | Grad Mean: 0.409974 | Grad Max: 1.465619
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005352
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003240 | Grad Max: 0.003240
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002789 | Grad Max: 0.381717
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051732 | Grad Max: 2.072227
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.008924
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028328 | Grad Max: 0.133202
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000445
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005753 | Grad Max: 0.011484
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000282
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001473 | Grad Max: 0.003963
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000721 | Grad Max: 0.002389
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020807 | Grad Max: 0.020807
[GRADIENT NORM TOTAL] 8.8663

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.857
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7396348  0.26036522] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 713/1335 | B: 614/1242 | C: 604/1444
[LOSS Ex1] A: 0.63057 | B: 0.61174 | C: 0.60357
[LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.868
[LOSS Ex2] A: 0.11797 | B: 0.31056 | C: 0.21132
** [JOINT LOSS] ** : 0.828572
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005454 | Grad Max: 0.210866
  -> Layer: shared_layers.0.bias | Grad Mean: 0.512914 | Grad Max: 2.717791
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.006038
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003779 | Grad Max: 0.003779
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003221 | Grad Max: 0.616527
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059616 | Grad Max: 3.455055
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.010798
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033997 | Grad Max: 0.172608
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000504
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006814 | Grad Max: 0.013888
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000369
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001739 | Grad Max: 0.006008
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000877 | Grad Max: 0.002876
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024432 | Grad Max: 0.024432
[GRADIENT NORM TOTAL] 11.2735

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.992
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6407597  0.35924035] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.085
[MASKS] A(Pass/Fail): 590/1026 | B: 660/1388 | C: 606/1442
[LOSS Ex1] A: 0.62870 | B: 0.61159 | C: 0.60571
[LOGITS Ex2 A] Mean Abs: 2.116 | Max: 8.251
[LOSS Ex2] A: 0.11647 | B: 0.33116 | C: 0.22854
** [JOINT LOSS] ** : 0.840722
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008609 | Grad Max: 0.269212
  -> Layer: shared_layers.0.bias | Grad Mean: 0.757692 | Grad Max: 3.552843
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.006577
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014296 | Grad Max: 0.014296
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004798 | Grad Max: 0.666473
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088930 | Grad Max: 3.708670
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000532 | Grad Max: 0.015986
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.050337 | Grad Max: 0.255185
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000805
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010319 | Grad Max: 0.020394
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000575
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002721 | Grad Max: 0.008481
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001454 | Grad Max: 0.003800
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040729 | Grad Max: 0.040729
[GRADIENT NORM TOTAL] 16.0836

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.217
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50818664 0.49181336] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 723/1325 | B: 648/1400 | C: 597/1451
[LOSS Ex1] A: 0.62944 | B: 0.61201 | C: 0.60593
[LOGITS Ex2 A] Mean Abs: 2.130 | Max: 7.792
[LOSS Ex2] A: 0.09141 | B: 0.31572 | C: 0.20937
** [JOINT LOSS] ** : 0.821296
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003488 | Grad Max: 0.105456
  -> Layer: shared_layers.0.bias | Grad Mean: 0.287392 | Grad Max: 1.253378
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.005803
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004014 | Grad Max: 0.004014
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001754 | Grad Max: 0.227105
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032039 | Grad Max: 1.265995
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000205 | Grad Max: 0.006285
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.019215 | Grad Max: 0.097884
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000425
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003878 | Grad Max: 0.008888
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000987 | Grad Max: 0.003751
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000534 | Grad Max: 0.002465
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013844 | Grad Max: 0.013844
[GRADIENT NORM TOTAL] 5.8009

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.147
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50615484 0.49384516] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.086
[MASKS] A(Pass/Fail): 719/1329 | B: 657/1391 | C: 607/1441
[LOSS Ex1] A: 0.62613 | B: 0.60751 | C: 0.60424
[LOGITS Ex2 A] Mean Abs: 2.192 | Max: 7.013
[LOSS Ex2] A: 0.10852 | B: 0.31383 | C: 0.23272
** [JOINT LOSS] ** : 0.830982
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007749 | Grad Max: 0.288793
  -> Layer: shared_layers.0.bias | Grad Mean: 0.738613 | Grad Max: 3.791044
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002308 | Grad Max: 0.006260
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006687 | Grad Max: 0.006687
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004769 | Grad Max: 0.762336
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088966 | Grad Max: 4.264806
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000509 | Grad Max: 0.015776
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048532 | Grad Max: 0.252863
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000695
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009745 | Grad Max: 0.019478
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000490
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002566 | Grad Max: 0.007532
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001207 | Grad Max: 0.003204
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036072 | Grad Max: 0.036072
[GRADIENT NORM TOTAL] 16.4838

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.182
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50969577 0.4903042 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 722/1326 | B: 614/1242 | C: 593/1455
[LOSS Ex1] A: 0.62508 | B: 0.61164 | C: 0.60609
[LOGITS Ex2 A] Mean Abs: 2.174 | Max: 5.848
[LOSS Ex2] A: 0.12355 | B: 0.32305 | C: 0.24570
** [JOINT LOSS] ** : 0.845040
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010589 | Grad Max: 0.371576
  -> Layer: shared_layers.0.bias | Grad Mean: 1.006228 | Grad Max: 4.936773
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.006152
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001727 | Grad Max: 0.001727
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006392 | Grad Max: 1.132149
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.118940 | Grad Max: 6.294788
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000686 | Grad Max: 0.022460
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065383 | Grad Max: 0.333602
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001015
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013220 | Grad Max: 0.026630
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000688
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003450 | Grad Max: 0.010637
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001662 | Grad Max: 0.003735
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048349 | Grad Max: 0.048349
[GRADIENT NORM TOTAL] 22.2536

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.979
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009275  0.49907246] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.084
[MASKS] A(Pass/Fail): 689/1359 | B: 660/1388 | C: 606/1442
[LOSS Ex1] A: 0.63404 | B: 0.61150 | C: 0.60363
[LOGITS Ex2 A] Mean Abs: 2.139 | Max: 6.029
[LOSS Ex2] A: 0.10618 | B: 0.33382 | C: 0.22185
** [JOINT LOSS] ** : 0.837005
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007498 | Grad Max: 0.255804
  -> Layer: shared_layers.0.bias | Grad Mean: 0.724119 | Grad Max: 3.382624
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005506
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002988 | Grad Max: 0.002988
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004612 | Grad Max: 0.722329
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086044 | Grad Max: 4.039410
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000511 | Grad Max: 0.016484
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048915 | Grad Max: 0.258978
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000793
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009822 | Grad Max: 0.020555
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000517
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002569 | Grad Max: 0.007718
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001227 | Grad Max: 0.003417
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036018 | Grad Max: 0.036018
[GRADIENT NORM TOTAL] 15.8381

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.876
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54270864 0.4572914 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 691/1357 | B: 648/1400 | C: 381/995
[LOSS Ex1] A: 0.63467 | B: 0.61192 | C: 0.61159
[LOGITS Ex2 A] Mean Abs: 2.085 | Max: 5.793
[LOSS Ex2] A: 0.11102 | B: 0.31265 | C: 0.24001
** [JOINT LOSS] ** : 0.840615
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003102 | Grad Max: 0.065737
  -> Layer: shared_layers.0.bias | Grad Mean: 0.103561 | Grad Max: 0.464813
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005308
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006595 | Grad Max: 0.006595
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000858 | Grad Max: 0.363393
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014831 | Grad Max: 2.028412
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.002746
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003045 | Grad Max: 0.028806
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000650 | Grad Max: 0.003309
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000160 | Grad Max: 0.000799
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001208
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002053 | Grad Max: 0.002053
[GRADIENT NORM TOTAL] 3.6177

[EPOCH SUMMARY] Train Loss: 0.8346

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8185 | Alpha: 0.5500
No improve count: 5/15

############################## EPOCH 164/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.097
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8123671  0.18763298] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.085
[MASKS] A(Pass/Fail): 751/1297 | B: 657/1391 | C: 610/1438
[LOSS Ex1] A: 0.62812 | B: 0.60742 | C: 0.60701
[LOGITS Ex2 A] Mean Abs: 2.080 | Max: 7.881
[LOSS Ex2] A: 0.10182 | B: 0.31354 | C: 0.24158
** [JOINT LOSS] ** : 0.833163
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005939 | Grad Max: 0.196100
  -> Layer: shared_layers.0.bias | Grad Mean: 0.541875 | Grad Max: 2.480964
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.005724
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000862 | Grad Max: 0.000862
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003427 | Grad Max: 0.434681
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064313 | Grad Max: 2.456796
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.012342
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033730 | Grad Max: 0.192081
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000536
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006735 | Grad Max: 0.013325
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000373
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001795 | Grad Max: 0.005260
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000982 | Grad Max: 0.002832
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027346 | Grad Max: 0.027346
[GRADIENT NORM TOTAL] 11.4644

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.219
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004076  0.49959242] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 625/1423
[LOSS Ex1] A: 0.63531 | B: 0.61155 | C: 0.60507
[LOGITS Ex2 A] Mean Abs: 2.104 | Max: 5.872
[LOSS Ex2] A: 0.09123 | B: 0.31035 | C: 0.20752
** [JOINT LOSS] ** : 0.820343
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005323 | Grad Max: 0.200011
  -> Layer: shared_layers.0.bias | Grad Mean: 0.539662 | Grad Max: 2.535846
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.005591
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002440 | Grad Max: 0.002440
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003586 | Grad Max: 0.606624
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066864 | Grad Max: 3.382406
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000379 | Grad Max: 0.011949
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036435 | Grad Max: 0.203069
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000552
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007262 | Grad Max: 0.013996
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000411
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001894 | Grad Max: 0.005656
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000998 | Grad Max: 0.003137
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027735 | Grad Max: 0.027735
[GRADIENT NORM TOTAL] 12.2514

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.859
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.74004775 0.25995222] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 713/1335 | B: 661/1387 | C: 589/1459
[LOSS Ex1] A: 0.63045 | B: 0.61142 | C: 0.60715
[LOGITS Ex2 A] Mean Abs: 2.116 | Max: 6.855
[LOSS Ex2] A: 0.11469 | B: 0.31607 | C: 0.21096
** [JOINT LOSS] ** : 0.830247
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003116 | Grad Max: 0.104676
  -> Layer: shared_layers.0.bias | Grad Mean: 0.114934 | Grad Max: 0.572253
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005757
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001981 | Grad Max: 0.001981
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000808 | Grad Max: 0.451691
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013317 | Grad Max: 2.526882
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.003399
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002609 | Grad Max: 0.031904
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000176
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000555 | Grad Max: 0.003323
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000144 | Grad Max: 0.000924
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000433 | Grad Max: 0.001265
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001475 | Grad Max: 0.001475
[GRADIENT NORM TOTAL] 3.8835

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.994
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6408966  0.35910338] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 590/1026 | B: 648/1400 | C: 586/1462
[LOSS Ex1] A: 0.62858 | B: 0.61183 | C: 0.60767
[LOGITS Ex2 A] Mean Abs: 2.205 | Max: 8.710
[LOSS Ex2] A: 0.10143 | B: 0.31831 | C: 0.23265
** [JOINT LOSS] ** : 0.833485
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004222 | Grad Max: 0.160741
  -> Layer: shared_layers.0.bias | Grad Mean: 0.432268 | Grad Max: 2.188075
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.006044
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009738 | Grad Max: 0.009738
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002875 | Grad Max: 0.422913
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053138 | Grad Max: 2.353067
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.009289
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.027898 | Grad Max: 0.145468
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000447
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005663 | Grad Max: 0.012592
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000298
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001457 | Grad Max: 0.004542
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000656 | Grad Max: 0.002226
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019521 | Grad Max: 0.019521
[GRADIENT NORM TOTAL] 9.7784

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.220
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081849 0.4918151] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 723/1325 | B: 657/1391 | C: 587/1461
[LOSS Ex1] A: 0.62932 | B: 0.60732 | C: 0.61063
[LOGITS Ex2 A] Mean Abs: 2.167 | Max: 9.040
[LOSS Ex2] A: 0.10415 | B: 0.29953 | C: 0.23146
** [JOINT LOSS] ** : 0.827466
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006639 | Grad Max: 0.177953
  -> Layer: shared_layers.0.bias | Grad Mean: 0.488731 | Grad Max: 2.311571
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.005439
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002440 | Grad Max: 0.002440
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003244 | Grad Max: 0.558571
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059826 | Grad Max: 3.132290
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000337 | Grad Max: 0.010397
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031197 | Grad Max: 0.138081
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000529
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006376 | Grad Max: 0.013633
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000358
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001669 | Grad Max: 0.004777
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000822 | Grad Max: 0.002592
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023512 | Grad Max: 0.023512
[GRADIENT NORM TOTAL] 10.9342

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.151
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.506101   0.49389896] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 719/1329 | B: 614/1242 | C: 597/1451
[LOSS Ex1] A: 0.62600 | B: 0.61145 | C: 0.60648
[LOGITS Ex2 A] Mean Abs: 2.153 | Max: 5.848
[LOSS Ex2] A: 0.10687 | B: 0.29700 | C: 0.22104
** [JOINT LOSS] ** : 0.822945
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003491 | Grad Max: 0.091703
  -> Layer: shared_layers.0.bias | Grad Mean: 0.255295 | Grad Max: 1.259772
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.006156
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002666 | Grad Max: 0.002666
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001577 | Grad Max: 0.567586
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028183 | Grad Max: 3.155018
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.004957
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013779 | Grad Max: 0.077487
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000283
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002751 | Grad Max: 0.007241
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000219
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000676 | Grad Max: 0.002857
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000402 | Grad Max: 0.001828
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009030 | Grad Max: 0.009030
[GRADIENT NORM TOTAL] 6.2643

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.185
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5097777  0.49022225] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 722/1326 | B: 661/1387 | C: 588/1460
[LOSS Ex1] A: 0.62495 | B: 0.61131 | C: 0.60714
[LOGITS Ex2 A] Mean Abs: 2.121 | Max: 5.638
[LOSS Ex2] A: 0.11013 | B: 0.31626 | C: 0.21055
** [JOINT LOSS] ** : 0.826784
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003192 | Grad Max: 0.160723
  -> Layer: shared_layers.0.bias | Grad Mean: 0.384230 | Grad Max: 2.078230
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005705
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005093 | Grad Max: 0.005093
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.767687
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042531 | Grad Max: 4.236332
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.008640
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021094 | Grad Max: 0.111759
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000337
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004200 | Grad Max: 0.008919
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000271
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001095 | Grad Max: 0.004049
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000545 | Grad Max: 0.002482
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015428 | Grad Max: 0.015428
[GRADIENT NORM TOTAL] 9.2137

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.982
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009243  0.49907577] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.084
[MASKS] A(Pass/Fail): 689/1359 | B: 648/1400 | C: 606/1442
[LOSS Ex1] A: 0.63390 | B: 0.61172 | C: 0.60378
[LOGITS Ex2 A] Mean Abs: 2.124 | Max: 5.912
[LOSS Ex2] A: 0.10631 | B: 0.32399 | C: 0.22463
** [JOINT LOSS] ** : 0.834775
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003870 | Grad Max: 0.137155
  -> Layer: shared_layers.0.bias | Grad Mean: 0.109853 | Grad Max: 0.679223
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.005303
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006940 | Grad Max: 0.006940
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001044 | Grad Max: 0.213301
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017558 | Grad Max: 1.187287
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.002887
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004472 | Grad Max: 0.032090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001105 | Grad Max: 0.003833
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000301 | Grad Max: 0.001241
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001294
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005071 | Grad Max: 0.005071
[GRADIENT NORM TOTAL] 3.4923

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.878
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54262245 0.45737758] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 691/1357 | B: 657/1391 | C: 648/1400
[LOSS Ex1] A: 0.63453 | B: 0.60721 | C: 0.59964
[LOGITS Ex2 A] Mean Abs: 2.136 | Max: 5.536
[LOSS Ex2] A: 0.10335 | B: 0.29958 | C: 0.20627
** [JOINT LOSS] ** : 0.816861
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003889 | Grad Max: 0.196906
  -> Layer: shared_layers.0.bias | Grad Mean: 0.416627 | Grad Max: 2.477912
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006014
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009775 | Grad Max: 0.009775
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002602 | Grad Max: 0.546973
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048364 | Grad Max: 3.037910
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000243 | Grad Max: 0.010081
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023349 | Grad Max: 0.137830
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000390
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004617 | Grad Max: 0.009743
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001230 | Grad Max: 0.003704
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000598 | Grad Max: 0.002144
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018018 | Grad Max: 0.018018
[GRADIENT NORM TOTAL] 9.7983

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81294996 0.18705004] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.086
[MASKS] A(Pass/Fail): 751/1297 | B: 614/1242 | C: 592/1456
[LOSS Ex1] A: 0.62798 | B: 0.61132 | C: 0.60644
[LOGITS Ex2 A] Mean Abs: 2.159 | Max: 6.206
[LOSS Ex2] A: 0.10404 | B: 0.29987 | C: 0.21948
** [JOINT LOSS] ** : 0.823042
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004477 | Grad Max: 0.195419
  -> Layer: shared_layers.0.bias | Grad Mean: 0.376711 | Grad Max: 1.957045
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002277 | Grad Max: 0.006277
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011375 | Grad Max: 0.011375
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002442 | Grad Max: 0.611973
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.044797 | Grad Max: 3.405525
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.007747
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023025 | Grad Max: 0.122786
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000373
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004721 | Grad Max: 0.010186
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000261
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001262 | Grad Max: 0.003750
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000595 | Grad Max: 0.002178
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018122 | Grad Max: 0.018122
[GRADIENT NORM TOTAL] 9.0116

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.223
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004489 0.4995511] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 661/1387 | C: 610/1438
[LOSS Ex1] A: 0.63516 | B: 0.61119 | C: 0.60167
[LOGITS Ex2 A] Mean Abs: 2.151 | Max: 5.845
[LOSS Ex2] A: 0.09596 | B: 0.32191 | C: 0.20621
** [JOINT LOSS] ** : 0.824040
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002653 | Grad Max: 0.119945
  -> Layer: shared_layers.0.bias | Grad Mean: 0.278064 | Grad Max: 1.668692
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005622
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000544 | Grad Max: 0.000544
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001660 | Grad Max: 0.508190
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030264 | Grad Max: 2.770454
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.006061
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011662 | Grad Max: 0.082521
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000201
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002208 | Grad Max: 0.006020
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000138
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000572 | Grad Max: 0.001858
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000424 | Grad Max: 0.001622
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007997 | Grad Max: 0.007997
[GRADIENT NORM TOTAL] 6.9917

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.862
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7404907 0.2595093] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.085
[MASKS] A(Pass/Fail): 713/1335 | B: 648/1400 | C: 596/1452
[LOSS Ex1] A: 0.63029 | B: 0.61159 | C: 0.60664
[LOGITS Ex2 A] Mean Abs: 2.123 | Max: 7.062
[LOSS Ex2] A: 0.11611 | B: 0.32569 | C: 0.20762
** [JOINT LOSS] ** : 0.832646
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005154 | Grad Max: 0.157545
  -> Layer: shared_layers.0.bias | Grad Mean: 0.412381 | Grad Max: 2.232842
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005735
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005529 | Grad Max: 0.005529
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002431 | Grad Max: 0.612842
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045444 | Grad Max: 3.439521
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000252 | Grad Max: 0.009006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023758 | Grad Max: 0.131495
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000374
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004610 | Grad Max: 0.009599
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000249
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001186 | Grad Max: 0.003393
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000657 | Grad Max: 0.002331
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018102 | Grad Max: 0.018102
[GRADIENT NORM TOTAL] 9.3774

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.998
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6410931 0.3589069] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.086
[MASKS] A(Pass/Fail): 591/1025 | B: 658/1390 | C: 618/1430
[LOSS Ex1] A: 0.62841 | B: 0.60708 | C: 0.60450
[LOGITS Ex2 A] Mean Abs: 2.197 | Max: 9.500
[LOSS Ex2] A: 0.10225 | B: 0.29914 | C: 0.21967
** [JOINT LOSS] ** : 0.820346
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002482 | Grad Max: 0.078259
  -> Layer: shared_layers.0.bias | Grad Mean: 0.108025 | Grad Max: 0.755609
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005642
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006387 | Grad Max: 0.006387
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000899 | Grad Max: 0.365488
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015455 | Grad Max: 2.045302
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.001968
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002543 | Grad Max: 0.030703
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000130
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000512 | Grad Max: 0.002721
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000136 | Grad Max: 0.000789
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000391 | Grad Max: 0.001049
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001172 | Grad Max: 0.001172
[GRADIENT NORM TOTAL] 4.0954

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.225
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.508233 0.491767] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 373/1003
[LOSS Ex1] A: 0.62914 | B: 0.61119 | C: 0.61315
[LOGITS Ex2 A] Mean Abs: 2.190 | Max: 8.465
[LOSS Ex2] A: 0.09638 | B: 0.29462 | C: 0.21287
** [JOINT LOSS] ** : 0.819123
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004018 | Grad Max: 0.132180
  -> Layer: shared_layers.0.bias | Grad Mean: 0.275972 | Grad Max: 1.311898
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.005553
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006493 | Grad Max: 0.006493
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001885 | Grad Max: 0.375996
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033779 | Grad Max: 2.090791
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000185 | Grad Max: 0.006353
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016989 | Grad Max: 0.096069
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000321
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003586 | Grad Max: 0.007774
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000199
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000970 | Grad Max: 0.002857
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000514 | Grad Max: 0.001831
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013974 | Grad Max: 0.013974
[GRADIENT NORM TOTAL] 6.4419

[EPOCH SUMMARY] Train Loss: 0.8261

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8088 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8105 -> New: 0.8088)

############################## EPOCH 165/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.155
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059911  0.49400893] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 719/1329 | B: 661/1387 | C: 593/1455
[LOSS Ex1] A: 0.62582 | B: 0.61106 | C: 0.60804
[LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.448
[LOSS Ex2] A: 0.10661 | B: 0.31160 | C: 0.20058
** [JOINT LOSS] ** : 0.821242
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001663 | Grad Max: 0.030680
  -> Layer: shared_layers.0.bias | Grad Mean: 0.087666 | Grad Max: 0.548111
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006348
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000653 | Grad Max: 0.000653
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000745 | Grad Max: 0.128430
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013062 | Grad Max: 0.685926
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.002718
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003927 | Grad Max: 0.026617
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000164
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000722 | Grad Max: 0.003629
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000090
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.001116
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000439 | Grad Max: 0.001337
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002505 | Grad Max: 0.002505
[GRADIENT NORM TOTAL] 2.5043

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.190
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5098793 0.4901207] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 722/1326 | B: 648/1400 | C: 593/1455
[LOSS Ex1] A: 0.62477 | B: 0.61145 | C: 0.60282
[LOGITS Ex2 A] Mean Abs: 2.169 | Max: 7.266
[LOSS Ex2] A: 0.11953 | B: 0.31244 | C: 0.22823
** [JOINT LOSS] ** : 0.833081
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.054849
  -> Layer: shared_layers.0.bias | Grad Mean: 0.120947 | Grad Max: 0.519657
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005837
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004492 | Grad Max: 0.004492
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001041 | Grad Max: 0.423611
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018517 | Grad Max: 2.340371
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.003416
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005073 | Grad Max: 0.044171
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000194
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000860 | Grad Max: 0.003761
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000167 | Grad Max: 0.000909
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000339 | Grad Max: 0.000914
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000033 | Grad Max: 0.000033
[GRADIENT NORM TOTAL] 4.4611

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.986
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009221  0.49907792] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.084
[MASKS] A(Pass/Fail): 689/1359 | B: 658/1390 | C: 618/1430
[LOSS Ex1] A: 0.63371 | B: 0.60693 | C: 0.60229
[LOGITS Ex2 A] Mean Abs: 2.155 | Max: 6.698
[LOSS Ex2] A: 0.10536 | B: 0.29674 | C: 0.22596
** [JOINT LOSS] ** : 0.823662
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004032 | Grad Max: 0.139623
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176088 | Grad Max: 0.609772
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005440
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004941 | Grad Max: 0.004941
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001269 | Grad Max: 0.614647
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022270 | Grad Max: 3.388187
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000110 | Grad Max: 0.003476
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009550 | Grad Max: 0.046395
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000286
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002087 | Grad Max: 0.005838
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000161
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000544 | Grad Max: 0.001933
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000432 | Grad Max: 0.001927
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008619 | Grad Max: 0.008619
[GRADIENT NORM TOTAL] 5.4920

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.881
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54251957 0.45748043] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 691/1357 | B: 614/1242 | C: 603/1445
[LOSS Ex1] A: 0.63434 | B: 0.61104 | C: 0.60567
[LOGITS Ex2 A] Mean Abs: 2.150 | Max: 6.207
[LOSS Ex2] A: 0.11119 | B: 0.29489 | C: 0.21132
** [JOINT LOSS] ** : 0.822817
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002989 | Grad Max: 0.077710
  -> Layer: shared_layers.0.bias | Grad Mean: 0.126941 | Grad Max: 0.688593
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005312
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006353 | Grad Max: 0.006353
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000867 | Grad Max: 0.217440
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014729 | Grad Max: 1.176293
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002126
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002155 | Grad Max: 0.018681
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000158
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000406 | Grad Max: 0.002267
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000113 | Grad Max: 0.000933
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000443 | Grad Max: 0.001281
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000653 | Grad Max: 0.000653
[GRADIENT NORM TOTAL] 3.5164

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.106
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81388503 0.18611501] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.086
[MASKS] A(Pass/Fail): 751/1297 | B: 661/1387 | C: 635/1413
[LOSS Ex1] A: 0.62777 | B: 0.61091 | C: 0.60057
[LOGITS Ex2 A] Mean Abs: 2.190 | Max: 7.238
[LOSS Ex2] A: 0.09986 | B: 0.31861 | C: 0.23859
** [JOINT LOSS] ** : 0.832104
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003758 | Grad Max: 0.113221
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196023 | Grad Max: 1.164450
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.006358
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004428 | Grad Max: 0.004428
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001134 | Grad Max: 0.378015
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020246 | Grad Max: 2.113435
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.004500
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008086 | Grad Max: 0.065995
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000204
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001527 | Grad Max: 0.004676
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000141
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000334 | Grad Max: 0.001639
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000275 | Grad Max: 0.000859
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003065 | Grad Max: 0.003065
[GRADIENT NORM TOTAL] 4.5944

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.228
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50052    0.49948004] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.084
[MASKS] A(Pass/Fail): 723/1325 | B: 648/1400 | C: 607/1441
[LOSS Ex1] A: 0.63495 | B: 0.61128 | C: 0.61073
[LOGITS Ex2 A] Mean Abs: 2.237 | Max: 6.119
[LOSS Ex2] A: 0.09944 | B: 0.31433 | C: 0.22222
** [JOINT LOSS] ** : 0.830988
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002540 | Grad Max: 0.094411
  -> Layer: shared_layers.0.bias | Grad Mean: 0.196681 | Grad Max: 1.089365
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.005347
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006789 | Grad Max: 0.006789
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001192 | Grad Max: 0.498563
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020954 | Grad Max: 2.734081
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003104
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006783 | Grad Max: 0.044729
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000182
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001473 | Grad Max: 0.004311
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000107
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000430 | Grad Max: 0.001520
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000356 | Grad Max: 0.001617
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007778 | Grad Max: 0.007778
[GRADIENT NORM TOTAL] 5.6796

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.866
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.74126816 0.2587318 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.086
[MASKS] A(Pass/Fail): 713/1335 | B: 658/1390 | C: 590/1458
[LOSS Ex1] A: 0.63005 | B: 0.60676 | C: 0.60476
[LOGITS Ex2 A] Mean Abs: 2.207 | Max: 6.699
[LOSS Ex2] A: 0.11531 | B: 0.29406 | C: 0.20300
** [JOINT LOSS] ** : 0.817983
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002566 | Grad Max: 0.061146
  -> Layer: shared_layers.0.bias | Grad Mean: 0.137711 | Grad Max: 0.693674
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.005569
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005672 | Grad Max: 0.005672
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000976 | Grad Max: 0.493469
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017139 | Grad Max: 2.751652
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002855
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002728 | Grad Max: 0.033795
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000128
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000455 | Grad Max: 0.002350
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000072
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000122 | Grad Max: 0.000798
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000446 | Grad Max: 0.001290
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000937 | Grad Max: 0.000937
[GRADIENT NORM TOTAL] 5.1120

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.003
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64144945 0.35855052] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.086
[MASKS] A(Pass/Fail): 591/1025 | B: 614/1242 | C: 592/1456
[LOSS Ex1] A: 0.62816 | B: 0.61086 | C: 0.60626
[LOGITS Ex2 A] Mean Abs: 2.257 | Max: 10.279
[LOSS Ex2] A: 0.10516 | B: 0.29403 | C: 0.19966
** [JOINT LOSS] ** : 0.814709
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004052 | Grad Max: 0.124344
  -> Layer: shared_layers.0.bias | Grad Mean: 0.135013 | Grad Max: 0.528847
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.005800
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002205 | Grad Max: 0.002205
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000987 | Grad Max: 0.401041
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016750 | Grad Max: 2.247566
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002520
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003148 | Grad Max: 0.024058
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000187
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000726 | Grad Max: 0.002902
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000221 | Grad Max: 0.001287
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000453 | Grad Max: 0.001476
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003309 | Grad Max: 0.003309
[GRADIENT NORM TOTAL] 4.2714

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.231
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082675 0.4917325] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 724/1324 | B: 667/1381 | C: 585/1463
[LOSS Ex1] A: 0.62890 | B: 0.61073 | C: 0.60806
[LOGITS Ex2 A] Mean Abs: 2.253 | Max: 8.674
[LOSS Ex2] A: 0.08996 | B: 0.32067 | C: 0.19978
** [JOINT LOSS] ** : 0.819369
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004181 | Grad Max: 0.161981
  -> Layer: shared_layers.0.bias | Grad Mean: 0.097453 | Grad Max: 0.946957
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006205
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004879 | Grad Max: 0.004879
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000845 | Grad Max: 0.170877
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013721 | Grad Max: 0.952561
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.002498
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003853 | Grad Max: 0.022960
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000225
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000944 | Grad Max: 0.003488
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000146
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000264 | Grad Max: 0.001495
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000429 | Grad Max: 0.001504
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003918 | Grad Max: 0.003918
[GRADIENT NORM TOTAL] 2.7431

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.160
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50583863 0.49416137] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 719/1329 | B: 649/1399 | C: 559/1489
[LOSS Ex1] A: 0.62557 | B: 0.61110 | C: 0.61389
[LOGITS Ex2 A] Mean Abs: 2.240 | Max: 6.121
[LOSS Ex2] A: 0.10323 | B: 0.30822 | C: 0.21014
** [JOINT LOSS] ** : 0.824044
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003396 | Grad Max: 0.099855
  -> Layer: shared_layers.0.bias | Grad Mean: 0.153062 | Grad Max: 0.917433
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006200
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005882 | Grad Max: 0.005882
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001061 | Grad Max: 0.211329
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018994 | Grad Max: 1.178789
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.004206
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010012 | Grad Max: 0.055708
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000241
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002065 | Grad Max: 0.005538
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000186
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000539 | Grad Max: 0.002125
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000373 | Grad Max: 0.001813
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007985 | Grad Max: 0.007985
[GRADIENT NORM TOTAL] 3.4590

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.196
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5099835 0.4900165] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.565 | Std: 0.086
[MASKS] A(Pass/Fail): 722/1326 | B: 659/1389 | C: 601/1447
[LOSS Ex1] A: 0.62451 | B: 0.60656 | C: 0.60327
[LOGITS Ex2 A] Mean Abs: 2.247 | Max: 9.036
[LOSS Ex2] A: 0.10867 | B: 0.29261 | C: 0.22687
** [JOINT LOSS] ** : 0.820829
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003568 | Grad Max: 0.114802
  -> Layer: shared_layers.0.bias | Grad Mean: 0.240379 | Grad Max: 1.571618
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.005788
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002887 | Grad Max: 0.002887
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001576 | Grad Max: 0.354677
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028293 | Grad Max: 1.982871
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000104 | Grad Max: 0.006943
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009709 | Grad Max: 0.087265
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000195
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001715 | Grad Max: 0.005368
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000118
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000403 | Grad Max: 0.001337
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000344 | Grad Max: 0.001025
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004523 | Grad Max: 0.004523
[GRADIENT NORM TOTAL] 6.0692

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.991
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500871 0.499129] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.562 | Std: 0.085
[MASKS] A(Pass/Fail): 690/1358 | B: 617/1239 | C: 617/1431
[LOSS Ex1] A: 0.63345 | B: 0.61064 | C: 0.60666
[LOGITS Ex2 A] Mean Abs: 2.208 | Max: 5.731
[LOSS Ex2] A: 0.11000 | B: 0.29226 | C: 0.20546
** [JOINT LOSS] ** : 0.819494
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006023 | Grad Max: 0.245768
  -> Layer: shared_layers.0.bias | Grad Mean: 0.188552 | Grad Max: 0.862962
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.005354
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005945 | Grad Max: 0.005945
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001436 | Grad Max: 0.356433
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023326 | Grad Max: 1.967425
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003210
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003635 | Grad Max: 0.044752
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000281
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000393 | Grad Max: 0.002702
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000086
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000129 | Grad Max: 0.000751
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001216
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001743 | Grad Max: 0.001743
[GRADIENT NORM TOTAL] 4.8925

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.886
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5424239 0.4575761] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.081
[MASKS] A(Pass/Fail): 692/1356 | B: 667/1381 | C: 641/1407
[LOSS Ex1] A: 0.63409 | B: 0.61053 | C: 0.60606
[LOGITS Ex2 A] Mean Abs: 2.207 | Max: 6.458
[LOSS Ex2] A: 0.10842 | B: 0.30814 | C: 0.22043
** [JOINT LOSS] ** : 0.829224
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004294 | Grad Max: 0.124990
  -> Layer: shared_layers.0.bias | Grad Mean: 0.164925 | Grad Max: 0.779219
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.005614
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009825 | Grad Max: 0.009825
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001083 | Grad Max: 0.460747
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018740 | Grad Max: 2.542868
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.003414
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005069 | Grad Max: 0.043743
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001203 | Grad Max: 0.004096
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000336 | Grad Max: 0.001224
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001475
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005598 | Grad Max: 0.005598
[GRADIENT NORM TOTAL] 4.6616

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.112
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81513786 0.18486212] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.086
[MASKS] A(Pass/Fail): 751/1297 | B: 650/1398 | C: 461/915
[LOSS Ex1] A: 0.62751 | B: 0.61089 | C: 0.59567
[LOGITS Ex2 A] Mean Abs: 2.261 | Max: 7.345
[LOSS Ex2] A: 0.09802 | B: 0.31014 | C: 0.23002
** [JOINT LOSS] ** : 0.824085
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003385 | Grad Max: 0.085529
  -> Layer: shared_layers.0.bias | Grad Mean: 0.164700 | Grad Max: 0.604367
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.006159
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007307 | Grad Max: 0.007307
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.388813
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020645 | Grad Max: 2.190099
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.003573
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006794 | Grad Max: 0.041048
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000244
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001414 | Grad Max: 0.005001
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000140
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000319 | Grad Max: 0.001673
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000270 | Grad Max: 0.001066
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002703 | Grad Max: 0.002703
[GRADIENT NORM TOTAL] 4.4975

[EPOCH SUMMARY] Train Loss: 0.8238

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8064 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8088 -> New: 0.8064)

############################## EPOCH 166/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.235
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50052404 0.49947596] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 722/1326 | B: 660/1388 | C: 603/1445
[LOSS Ex1] A: 0.63470 | B: 0.60636 | C: 0.60262
[LOGITS Ex2 A] Mean Abs: 2.276 | Max: 6.837
[LOSS Ex2] A: 0.08856 | B: 0.29370 | C: 0.20193
** [JOINT LOSS] ** : 0.809292
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003196 | Grad Max: 0.080145
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149166 | Grad Max: 0.726276
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005754
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002117 | Grad Max: 0.002117
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000920 | Grad Max: 0.400234
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015802 | Grad Max: 2.241073
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003383
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003900 | Grad Max: 0.042421
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000113
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000695 | Grad Max: 0.003391
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000115
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000182 | Grad Max: 0.001434
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001377
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001949 | Grad Max: 0.001949
[GRADIENT NORM TOTAL] 4.1309

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.871
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7422363  0.25776365] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.086
[MASKS] A(Pass/Fail): 713/1335 | B: 617/1239 | C: 628/1420
[LOSS Ex1] A: 0.62978 | B: 0.61044 | C: 0.60329
[LOGITS Ex2 A] Mean Abs: 2.267 | Max: 6.506
[LOSS Ex2] A: 0.11356 | B: 0.28953 | C: 0.18257
** [JOINT LOSS] ** : 0.809723
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003570 | Grad Max: 0.126265
  -> Layer: shared_layers.0.bias | Grad Mean: 0.188555 | Grad Max: 0.960342
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.005738
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002307 | Grad Max: 0.002307
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001163 | Grad Max: 0.431720
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020023 | Grad Max: 2.395000
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003648
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006411 | Grad Max: 0.052483
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000146
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000999 | Grad Max: 0.003688
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000093
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000225 | Grad Max: 0.000993
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000408 | Grad Max: 0.001074
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001929 | Grad Max: 0.001929
[GRADIENT NORM TOTAL] 5.0766

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.009
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64186555 0.35813445] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.086
[MASKS] A(Pass/Fail): 591/1025 | B: 667/1381 | C: 638/1410
[LOSS Ex1] A: 0.62788 | B: 0.61032 | C: 0.60847
[LOGITS Ex2 A] Mean Abs: 2.330 | Max: 10.246
[LOSS Ex2] A: 0.10159 | B: 0.31076 | C: 0.23924
** [JOINT LOSS] ** : 0.832757
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004232 | Grad Max: 0.106788
  -> Layer: shared_layers.0.bias | Grad Mean: 0.156348 | Grad Max: 1.288900
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005834
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007275 | Grad Max: 0.007275
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001334 | Grad Max: 0.465160
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023472 | Grad Max: 2.560962
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.003513
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009273 | Grad Max: 0.050557
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000296
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001851 | Grad Max: 0.005884
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000393 | Grad Max: 0.002120
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000194 | Grad Max: 0.000780
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002974 | Grad Max: 0.002974
[GRADIENT NORM TOTAL] 5.2305

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.237
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082321  0.49176785] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.087
[MASKS] A(Pass/Fail): 724/1324 | B: 650/1398 | C: 603/1445
[LOSS Ex1] A: 0.62862 | B: 0.61068 | C: 0.60568
[LOGITS Ex2 A] Mean Abs: 2.330 | Max: 8.299
[LOSS Ex2] A: 0.09437 | B: 0.30696 | C: 0.21557
** [JOINT LOSS] ** : 0.820628
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003253 | Grad Max: 0.122114
  -> Layer: shared_layers.0.bias | Grad Mean: 0.163416 | Grad Max: 0.596247
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.005743
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002257 | Grad Max: 0.002257
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001319 | Grad Max: 0.425152
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022800 | Grad Max: 2.332852
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.003411
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006650 | Grad Max: 0.031648
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000259
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001465 | Grad Max: 0.004780
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000141
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000322 | Grad Max: 0.001479
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001201
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002855 | Grad Max: 0.002855
[GRADIENT NORM TOTAL] 5.3140

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.166
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50573266 0.4942673 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 719/1329 | B: 661/1387 | C: 638/1410
[LOSS Ex1] A: 0.62528 | B: 0.60615 | C: 0.60623
[LOGITS Ex2 A] Mean Abs: 2.302 | Max: 5.833
[LOSS Ex2] A: 0.09138 | B: 0.29204 | C: 0.21262
** [JOINT LOSS] ** : 0.811231
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003466 | Grad Max: 0.104421
  -> Layer: shared_layers.0.bias | Grad Mean: 0.159389 | Grad Max: 0.733474
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.006213
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006149 | Grad Max: 0.006149
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001271 | Grad Max: 0.145838
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022982 | Grad Max: 0.801062
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.005980
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011383 | Grad Max: 0.078338
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000245
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002268 | Grad Max: 0.005392
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000146
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000581 | Grad Max: 0.001971
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000362 | Grad Max: 0.001643
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008705 | Grad Max: 0.008705
[GRADIENT NORM TOTAL] 3.8827

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.202
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.510097 0.489903] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 722/1326 | B: 619/1237 | C: 597/1451
[LOSS Ex1] A: 0.62423 | B: 0.61022 | C: 0.60750
[LOGITS Ex2 A] Mean Abs: 2.288 | Max: 6.635
[LOSS Ex2] A: 0.10620 | B: 0.29568 | C: 0.21115
** [JOINT LOSS] ** : 0.818322
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002518 | Grad Max: 0.062965
  -> Layer: shared_layers.0.bias | Grad Mean: 0.140441 | Grad Max: 0.716237
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.006329
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002549 | Grad Max: 0.002549
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000802 | Grad Max: 0.229499
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013924 | Grad Max: 1.292447
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002839
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002838 | Grad Max: 0.031494
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000137
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000472 | Grad Max: 0.002989
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000089
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000110 | Grad Max: 0.000708
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000343 | Grad Max: 0.001028
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000111 | Grad Max: 0.000111
[GRADIENT NORM TOTAL] 3.4228

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.996
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50081146 0.4991885 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 690/1358 | B: 667/1381 | C: 611/1437
[LOSS Ex1] A: 0.63318 | B: 0.61011 | C: 0.59801
[LOGITS Ex2 A] Mean Abs: 2.267 | Max: 6.304
[LOSS Ex2] A: 0.09299 | B: 0.30658 | C: 0.21276
** [JOINT LOSS] ** : 0.817877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003488 | Grad Max: 0.149578
  -> Layer: shared_layers.0.bias | Grad Mean: 0.128225 | Grad Max: 0.474835
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.005877
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007688 | Grad Max: 0.007688
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000947 | Grad Max: 0.616392
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015572 | Grad Max: 3.401599
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.002607
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002239 | Grad Max: 0.022307
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000172
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000416 | Grad Max: 0.002571
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000115 | Grad Max: 0.000699
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000384 | Grad Max: 0.001333
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002485 | Grad Max: 0.002485
[GRADIENT NORM TOTAL] 5.0592

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.891
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5423557 0.4576443] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.082
[MASKS] A(Pass/Fail): 692/1356 | B: 650/1398 | C: 571/1477
[LOSS Ex1] A: 0.63383 | B: 0.61046 | C: 0.60643
[LOGITS Ex2 A] Mean Abs: 2.258 | Max: 6.167
[LOSS Ex2] A: 0.10415 | B: 0.30384 | C: 0.20569
** [JOINT LOSS] ** : 0.821467
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003826 | Grad Max: 0.117314
  -> Layer: shared_layers.0.bias | Grad Mean: 0.207238 | Grad Max: 0.881619
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.005518
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007365 | Grad Max: 0.007365
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001556 | Grad Max: 0.543941
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027432 | Grad Max: 3.009562
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.003778
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008320 | Grad Max: 0.055575
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000179
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001394 | Grad Max: 0.004420
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000094
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000348 | Grad Max: 0.001239
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000382 | Grad Max: 0.001167
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004152 | Grad Max: 0.004152
[GRADIENT NORM TOTAL] 6.0043

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.117
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8164772 0.1835228] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.086
[MASKS] A(Pass/Fail): 751/1297 | B: 661/1387 | C: 634/1414
[LOSS Ex1] A: 0.62723 | B: 0.60593 | C: 0.60501
[LOGITS Ex2 A] Mean Abs: 2.311 | Max: 6.671
[LOSS Ex2] A: 0.09455 | B: 0.28826 | C: 0.22662
** [JOINT LOSS] ** : 0.815868
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002375 | Grad Max: 0.053702
  -> Layer: shared_layers.0.bias | Grad Mean: 0.114948 | Grad Max: 0.775124
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005324
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003395 | Grad Max: 0.003395
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001059 | Grad Max: 0.336733
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019067 | Grad Max: 1.865715
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.003839
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007484 | Grad Max: 0.044482
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001376 | Grad Max: 0.005116
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000117
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000292 | Grad Max: 0.001440
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000320 | Grad Max: 0.001007
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003075 | Grad Max: 0.003075
[GRADIENT NORM TOTAL] 4.0564

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.241
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50051236 0.4994876 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 722/1326 | B: 619/1237 | C: 633/1415
[LOSS Ex1] A: 0.63444 | B: 0.60999 | C: 0.60644
[LOGITS Ex2 A] Mean Abs: 2.289 | Max: 7.537
[LOSS Ex2] A: 0.09078 | B: 0.30284 | C: 0.21090
** [JOINT LOSS] ** : 0.818463
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006093 | Grad Max: 0.184705
  -> Layer: shared_layers.0.bias | Grad Mean: 0.557106 | Grad Max: 2.530241
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.004872
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002925 | Grad Max: 0.002925
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003391 | Grad Max: 0.560397
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062181 | Grad Max: 3.151563
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.011298
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034354 | Grad Max: 0.166618
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000568
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006762 | Grad Max: 0.015141
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000362
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001628 | Grad Max: 0.005054
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000861 | Grad Max: 0.002642
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023196 | Grad Max: 0.023196
[GRADIENT NORM TOTAL] 11.9349

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.876
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7432612  0.25673878] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.086
[MASKS] A(Pass/Fail): 713/1335 | B: 667/1381 | C: 627/1421
[LOSS Ex1] A: 0.62951 | B: 0.60991 | C: 0.60489
[LOGITS Ex2 A] Mean Abs: 2.272 | Max: 7.594
[LOSS Ex2] A: 0.11568 | B: 0.31632 | C: 0.23296
** [JOINT LOSS] ** : 0.836422
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003529 | Grad Max: 0.172563
  -> Layer: shared_layers.0.bias | Grad Mean: 0.411223 | Grad Max: 2.212615
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005968
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004934 | Grad Max: 0.004934
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002498 | Grad Max: 0.582701
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045826 | Grad Max: 3.256716
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000237 | Grad Max: 0.009116
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022926 | Grad Max: 0.121818
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000364
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004362 | Grad Max: 0.009473
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000226
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001113 | Grad Max: 0.003412
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.001608
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016839 | Grad Max: 0.016839
[GRADIENT NORM TOTAL] 9.5616

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.015
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64240223 0.35759777] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.087
[MASKS] A(Pass/Fail): 590/1026 | B: 650/1398 | C: 646/1402
[LOSS Ex1] A: 0.62761 | B: 0.61026 | C: 0.59876
[LOGITS Ex2 A] Mean Abs: 2.344 | Max: 11.826
[LOSS Ex2] A: 0.10439 | B: 0.31554 | C: 0.22951
** [JOINT LOSS] ** : 0.828691
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004759 | Grad Max: 0.149250
  -> Layer: shared_layers.0.bias | Grad Mean: 0.310833 | Grad Max: 1.239121
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.005792
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007274 | Grad Max: 0.007274
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002422 | Grad Max: 0.604734
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043598 | Grad Max: 3.343949
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000213 | Grad Max: 0.006333
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020172 | Grad Max: 0.100328
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000343
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004094 | Grad Max: 0.009300
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000306
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000973 | Grad Max: 0.004350
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001494
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011621 | Grad Max: 0.011621
[GRADIENT NORM TOTAL] 8.1191

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.243
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50822777 0.4917723 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.087
[MASKS] A(Pass/Fail): 724/1324 | B: 662/1386 | C: 590/1458
[LOSS Ex1] A: 0.62836 | B: 0.60574 | C: 0.60778
[LOGITS Ex2 A] Mean Abs: 2.327 | Max: 9.094
[LOSS Ex2] A: 0.09374 | B: 0.28338 | C: 0.19103
** [JOINT LOSS] ** : 0.803344
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006148 | Grad Max: 0.210924
  -> Layer: shared_layers.0.bias | Grad Mean: 0.409164 | Grad Max: 2.504673
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005705
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000266 | Grad Max: 0.000266
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002706 | Grad Max: 0.572337
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.048821 | Grad Max: 3.166521
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.007366
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022508 | Grad Max: 0.111783
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000395
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004784 | Grad Max: 0.010344
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000251
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001269 | Grad Max: 0.003461
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000607 | Grad Max: 0.002330
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018457 | Grad Max: 0.018457
[GRADIENT NORM TOTAL] 9.5495

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.173
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056154  0.49438456] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 720/1328 | B: 619/1237 | C: 391/985
[LOSS Ex1] A: 0.62502 | B: 0.60980 | C: 0.60937
[LOGITS Ex2 A] Mean Abs: 2.304 | Max: 7.187
[LOSS Ex2] A: 0.09566 | B: 0.29434 | C: 0.20022
** [JOINT LOSS] ** : 0.811468
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002610 | Grad Max: 0.079053
  -> Layer: shared_layers.0.bias | Grad Mean: 0.151652 | Grad Max: 0.890185
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.006356
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006943 | Grad Max: 0.006943
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001281 | Grad Max: 0.179107
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022675 | Grad Max: 0.989885
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000120 | Grad Max: 0.004997
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010880 | Grad Max: 0.079939
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000217
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001849 | Grad Max: 0.005905
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000145
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000372 | Grad Max: 0.001565
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000407 | Grad Max: 0.001498
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004851 | Grad Max: 0.004851
[GRADIENT NORM TOTAL] 3.8489

[EPOCH SUMMARY] Train Loss: 0.8183

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8008 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8064 -> New: 0.8008)

############################## EPOCH 167/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.208
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101742  0.48982576] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 722/1326 | B: 667/1381 | C: 594/1454
[LOSS Ex1] A: 0.62397 | B: 0.60973 | C: 0.60640
[LOGITS Ex2 A] Mean Abs: 2.296 | Max: 8.152
[LOSS Ex2] A: 0.10755 | B: 0.31635 | C: 0.20890
** [JOINT LOSS] ** : 0.824300
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.064545
  -> Layer: shared_layers.0.bias | Grad Mean: 0.095673 | Grad Max: 0.401898
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006096
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000449 | Grad Max: 0.000449
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000904 | Grad Max: 0.217278
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015605 | Grad Max: 1.205248
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.004254
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002811 | Grad Max: 0.038800
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000142
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000412 | Grad Max: 0.002903
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000103 | Grad Max: 0.000672
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000306 | Grad Max: 0.000895
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001367 | Grad Max: 0.001367
[GRADIENT NORM TOTAL] 3.3078

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.001
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007374 0.4992626] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 690/1358 | B: 650/1398 | C: 601/1447
[LOSS Ex1] A: 0.63293 | B: 0.61008 | C: 0.60971
[LOGITS Ex2 A] Mean Abs: 2.280 | Max: 7.258
[LOSS Ex2] A: 0.09835 | B: 0.30574 | C: 0.21551
** [JOINT LOSS] ** : 0.824106
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004456 | Grad Max: 0.110807
  -> Layer: shared_layers.0.bias | Grad Mean: 0.245933 | Grad Max: 1.224875
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.005503
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006957 | Grad Max: 0.006957
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.252478
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033079 | Grad Max: 1.381093
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.007691
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012862 | Grad Max: 0.111598
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000206
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002241 | Grad Max: 0.006577
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000143
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000507 | Grad Max: 0.001826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.001174
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005711 | Grad Max: 0.005711
[GRADIENT NORM TOTAL] 5.9135

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.895
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54233676 0.45766327] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.561 | Std: 0.082
[MASKS] A(Pass/Fail): 693/1355 | B: 662/1386 | C: 623/1425
[LOSS Ex1] A: 0.63360 | B: 0.60555 | C: 0.60607
[LOGITS Ex2 A] Mean Abs: 2.264 | Max: 6.194
[LOSS Ex2] A: 0.11220 | B: 0.27957 | C: 0.21912
** [JOINT LOSS] ** : 0.818703
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005722 | Grad Max: 0.222514
  -> Layer: shared_layers.0.bias | Grad Mean: 0.141919 | Grad Max: 0.617660
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005216
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005183 | Grad Max: 0.005183
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001162 | Grad Max: 0.385266
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018820 | Grad Max: 2.164001
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.003469
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004028 | Grad Max: 0.028385
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000228
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000933 | Grad Max: 0.003706
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000114
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000194 | Grad Max: 0.001026
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000357 | Grad Max: 0.001444
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003220 | Grad Max: 0.003220
[GRADIENT NORM TOTAL] 4.2841

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.123
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.81762815 0.18237184] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.087
[MASKS] A(Pass/Fail): 751/1297 | B: 619/1237 | C: 611/1437
[LOSS Ex1] A: 0.62700 | B: 0.60961 | C: 0.60709
[LOGITS Ex2 A] Mean Abs: 2.294 | Max: 6.718
[LOSS Ex2] A: 0.09615 | B: 0.28633 | C: 0.21470
** [JOINT LOSS] ** : 0.813623
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.082713
  -> Layer: shared_layers.0.bias | Grad Mean: 0.224596 | Grad Max: 1.200093
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.005472
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002559 | Grad Max: 0.002559
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001321 | Grad Max: 0.630204
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023953 | Grad Max: 3.495461
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000098 | Grad Max: 0.004291
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009082 | Grad Max: 0.042622
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000197
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001760 | Grad Max: 0.005543
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000154
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000400 | Grad Max: 0.002061
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000344 | Grad Max: 0.001616
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005902 | Grad Max: 0.005902
[GRADIENT NORM TOTAL] 6.1667

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.247
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004655  0.49953443] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.085
[MASKS] A(Pass/Fail): 723/1325 | B: 667/1381 | C: 620/1428
[LOSS Ex1] A: 0.63422 | B: 0.60954 | C: 0.60397
[LOGITS Ex2 A] Mean Abs: 2.325 | Max: 6.814
[LOSS Ex2] A: 0.09521 | B: 0.31391 | C: 0.20669
** [JOINT LOSS] ** : 0.821183
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003471 | Grad Max: 0.075288
  -> Layer: shared_layers.0.bias | Grad Mean: 0.168750 | Grad Max: 0.760853
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.005459
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000786 | Grad Max: 0.000786
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001449 | Grad Max: 0.554126
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026768 | Grad Max: 3.083837
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.006376
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013195 | Grad Max: 0.086450
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000227
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002505 | Grad Max: 0.006619
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000191
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000601 | Grad Max: 0.002326
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000350 | Grad Max: 0.001472
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007569 | Grad Max: 0.007569
[GRADIENT NORM TOTAL] 5.5283

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.881
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7440529 0.2559471] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.087
[MASKS] A(Pass/Fail): 714/1334 | B: 650/1398 | C: 694/1354
[LOSS Ex1] A: 0.62928 | B: 0.60989 | C: 0.59313
[LOGITS Ex2 A] Mean Abs: 2.305 | Max: 7.369
[LOSS Ex2] A: 0.10916 | B: 0.30541 | C: 0.19016
** [JOINT LOSS] ** : 0.812342
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002654 | Grad Max: 0.077930
  -> Layer: shared_layers.0.bias | Grad Mean: 0.186673 | Grad Max: 0.955453
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.005791
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005584 | Grad Max: 0.005584
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001035 | Grad Max: 0.702375
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018109 | Grad Max: 3.898429
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.003460
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004117 | Grad Max: 0.032318
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000650 | Grad Max: 0.003290
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000100
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000778
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001221
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001583 | Grad Max: 0.001583
[GRADIENT NORM TOTAL] 5.8278

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.021
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6428182  0.35718176] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.087
[MASKS] A(Pass/Fail): 590/1026 | B: 662/1386 | C: 606/1442
[LOSS Ex1] A: 0.62739 | B: 0.60536 | C: 0.60702
[LOGITS Ex2 A] Mean Abs: 2.368 | Max: 8.579
[LOSS Ex2] A: 0.09596 | B: 0.27734 | C: 0.23604
** [JOINT LOSS] ** : 0.816374
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003089 | Grad Max: 0.065351
  -> Layer: shared_layers.0.bias | Grad Mean: 0.093690 | Grad Max: 0.385697
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005956
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004130 | Grad Max: 0.004130
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000839 | Grad Max: 0.136934
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014187 | Grad Max: 0.763568
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003350
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004356 | Grad Max: 0.043493
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000140
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000691 | Grad Max: 0.003536
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000078
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000183 | Grad Max: 0.000822
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000266 | Grad Max: 0.001125
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003261 | Grad Max: 0.003261
[GRADIENT NORM TOTAL] 2.5093

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.250
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081689  0.49183115] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.087
[MASKS] A(Pass/Fail): 724/1324 | B: 619/1237 | C: 651/1397
[LOSS Ex1] A: 0.62813 | B: 0.60941 | C: 0.59917
[LOGITS Ex2 A] Mean Abs: 2.366 | Max: 10.593
[LOSS Ex2] A: 0.09165 | B: 0.29743 | C: 0.20837
** [JOINT LOSS] ** : 0.811388
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003958 | Grad Max: 0.126458
  -> Layer: shared_layers.0.bias | Grad Mean: 0.252780 | Grad Max: 1.105037
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.005556
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000225 | Grad Max: 0.000225
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001721 | Grad Max: 0.334367
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031221 | Grad Max: 1.846927
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.005960
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013307 | Grad Max: 0.082038
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000276
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002627 | Grad Max: 0.006256
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000195
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000656 | Grad Max: 0.002135
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000357 | Grad Max: 0.001681
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008833 | Grad Max: 0.008833
[GRADIENT NORM TOTAL] 6.1480

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.178
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50562143 0.49437857] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 720/1328 | B: 667/1381 | C: 633/1415
[LOSS Ex1] A: 0.62479 | B: 0.60935 | C: 0.60307
[LOGITS Ex2 A] Mean Abs: 2.335 | Max: 6.259
[LOSS Ex2] A: 0.09841 | B: 0.31000 | C: 0.21636
** [JOINT LOSS] ** : 0.820662
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.086094
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143873 | Grad Max: 0.906196
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005967
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000239 | Grad Max: 0.000239
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001076 | Grad Max: 0.447286
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019194 | Grad Max: 2.455468
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003565
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003618 | Grad Max: 0.040545
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000149
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000686 | Grad Max: 0.003622
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000113
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000184 | Grad Max: 0.001235
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000249 | Grad Max: 0.001147
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003151 | Grad Max: 0.003151
[GRADIENT NORM TOTAL] 4.7771

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.214
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51026076 0.48973924] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 722/1326 | B: 650/1398 | C: 581/1467
[LOSS Ex1] A: 0.62374 | B: 0.60969 | C: 0.60801
[LOGITS Ex2 A] Mean Abs: 2.308 | Max: 7.550
[LOSS Ex2] A: 0.11212 | B: 0.30703 | C: 0.19579
** [JOINT LOSS] ** : 0.818793
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002389 | Grad Max: 0.049043
  -> Layer: shared_layers.0.bias | Grad Mean: 0.110867 | Grad Max: 0.547526
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006123
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001624 | Grad Max: 0.001624
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000945 | Grad Max: 0.344475
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016783 | Grad Max: 1.895428
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002647
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002186 | Grad Max: 0.026465
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000157
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000400 | Grad Max: 0.002413
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000100 | Grad Max: 0.000627
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001060
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001874 | Grad Max: 0.001874
[GRADIENT NORM TOTAL] 3.9053

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.006
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50065976 0.49934024] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 690/1358 | B: 663/1385 | C: 601/1447
[LOSS Ex1] A: 0.63271 | B: 0.60516 | C: 0.61105
[LOGITS Ex2 A] Mean Abs: 2.293 | Max: 6.597
[LOSS Ex2] A: 0.09957 | B: 0.28872 | C: 0.22077
** [JOINT LOSS] ** : 0.819326
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003585 | Grad Max: 0.133432
  -> Layer: shared_layers.0.bias | Grad Mean: 0.182276 | Grad Max: 0.858926
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.005091
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005192 | Grad Max: 0.005192
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001480 | Grad Max: 0.453397
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026759 | Grad Max: 2.531817
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.006456
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013265 | Grad Max: 0.073197
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000311
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002675 | Grad Max: 0.006453
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000168
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000660 | Grad Max: 0.001964
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000391 | Grad Max: 0.001632
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010340 | Grad Max: 0.010340
[GRADIENT NORM TOTAL] 5.0917

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.900
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5423908  0.45760918] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.082
[MASKS] A(Pass/Fail): 693/1355 | B: 619/1237 | C: 643/1405
[LOSS Ex1] A: 0.63339 | B: 0.60920 | C: 0.59924
[LOGITS Ex2 A] Mean Abs: 2.342 | Max: 6.787
[LOSS Ex2] A: 0.09804 | B: 0.29184 | C: 0.22302
** [JOINT LOSS] ** : 0.818240
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002976 | Grad Max: 0.150096
  -> Layer: shared_layers.0.bias | Grad Mean: 0.342726 | Grad Max: 1.551452
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.005849
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009985 | Grad Max: 0.009985
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.641229
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.039529 | Grad Max: 3.560706
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000189 | Grad Max: 0.008927
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018609 | Grad Max: 0.108694
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000301
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003461 | Grad Max: 0.007809
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000221
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000843 | Grad Max: 0.003348
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001772
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010637 | Grad Max: 0.010637
[GRADIENT NORM TOTAL] 8.4603

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.129
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8188314  0.18116868] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.087
[MASKS] A(Pass/Fail): 751/1297 | B: 667/1381 | C: 637/1411
[LOSS Ex1] A: 0.62676 | B: 0.60916 | C: 0.60256
[LOGITS Ex2 A] Mean Abs: 2.339 | Max: 7.543
[LOSS Ex2] A: 0.09541 | B: 0.30048 | C: 0.22518
** [JOINT LOSS] ** : 0.819850
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004473 | Grad Max: 0.137099
  -> Layer: shared_layers.0.bias | Grad Mean: 0.304428 | Grad Max: 1.590001
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.005974
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003925 | Grad Max: 0.003925
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001996 | Grad Max: 0.284864
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036865 | Grad Max: 1.585055
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.007143
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022327 | Grad Max: 0.107942
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000360
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004215 | Grad Max: 0.009684
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000256
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000990 | Grad Max: 0.003252
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000457 | Grad Max: 0.001827
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013148 | Grad Max: 0.013148
[GRADIENT NORM TOTAL] 6.4284

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.255
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004184  0.49958155] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.086
[MASKS] A(Pass/Fail): 724/1324 | B: 650/1398 | C: 420/956
[LOSS Ex1] A: 0.63400 | B: 0.60949 | C: 0.60297
[LOGITS Ex2 A] Mean Abs: 2.347 | Max: 7.354
[LOSS Ex2] A: 0.09163 | B: 0.30899 | C: 0.22874
** [JOINT LOSS] ** : 0.825280
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006797 | Grad Max: 0.217317
  -> Layer: shared_layers.0.bias | Grad Mean: 0.571175 | Grad Max: 2.851531
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.005451
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005886 | Grad Max: 0.005886
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003754 | Grad Max: 0.886042
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069341 | Grad Max: 4.910470
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.013805
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035112 | Grad Max: 0.183642
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000524
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006883 | Grad Max: 0.014041
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000348
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001675 | Grad Max: 0.005251
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000891 | Grad Max: 0.002447
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024805 | Grad Max: 0.024805
[GRADIENT NORM TOTAL] 13.3506

[EPOCH SUMMARY] Train Loss: 0.8189

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8066 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 168/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.886
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.74505603 0.25494394] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.087
[MASKS] A(Pass/Fail): 714/1334 | B: 663/1385 | C: 661/1387
[LOSS Ex1] A: 0.62905 | B: 0.60497 | C: 0.59624
[LOGITS Ex2 A] Mean Abs: 2.284 | Max: 7.355
[LOSS Ex2] A: 0.11288 | B: 0.29862 | C: 0.20390
** [JOINT LOSS] ** : 0.815218
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005851 | Grad Max: 0.208375
  -> Layer: shared_layers.0.bias | Grad Mean: 0.622774 | Grad Max: 2.841291
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.005804
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004282 | Grad Max: 0.004282
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003717 | Grad Max: 0.808064
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069611 | Grad Max: 4.480676
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.013668
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037405 | Grad Max: 0.205699
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000562
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007090 | Grad Max: 0.016260
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000365
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001679 | Grad Max: 0.005166
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000825 | Grad Max: 0.002489
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023594 | Grad Max: 0.023594
[GRADIENT NORM TOTAL] 13.3534

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.027
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64344144 0.3565585 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.087
[MASKS] A(Pass/Fail): 590/1026 | B: 619/1237 | C: 626/1422
[LOSS Ex1] A: 0.62716 | B: 0.60901 | C: 0.60712
[LOGITS Ex2 A] Mean Abs: 2.383 | Max: 9.593
[LOSS Ex2] A: 0.09469 | B: 0.28779 | C: 0.21080
** [JOINT LOSS] ** : 0.812190
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003895 | Grad Max: 0.086758
  -> Layer: shared_layers.0.bias | Grad Mean: 0.249495 | Grad Max: 1.045749
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.005714
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007930 | Grad Max: 0.007930
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001620 | Grad Max: 0.647319
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.029691 | Grad Max: 3.587425
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.006319
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015111 | Grad Max: 0.090946
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000307
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002927 | Grad Max: 0.008047
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000171
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.002229
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001639
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010630 | Grad Max: 0.010630
[GRADIENT NORM TOTAL] 6.5535

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.257
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081495 0.4918505] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 724/1324 | B: 667/1381 | C: 624/1424
[LOSS Ex1] A: 0.62790 | B: 0.60899 | C: 0.60305
[LOGITS Ex2 A] Mean Abs: 2.394 | Max: 9.426
[LOSS Ex2] A: 0.09029 | B: 0.32260 | C: 0.22754
** [JOINT LOSS] ** : 0.826788
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010400 | Grad Max: 0.312958
  -> Layer: shared_layers.0.bias | Grad Mean: 0.909919 | Grad Max: 4.137902
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.005422
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004835 | Grad Max: 0.004835
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006121 | Grad Max: 0.841210
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.113418 | Grad Max: 4.678674
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000641 | Grad Max: 0.019609
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061906 | Grad Max: 0.326748
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000869
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012105 | Grad Max: 0.023650
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000624
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002912 | Grad Max: 0.009282
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001404 | Grad Max: 0.003630
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040670 | Grad Max: 0.040670
[GRADIENT NORM TOTAL] 19.8104

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.183
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056449  0.49435502] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.087
[MASKS] A(Pass/Fail): 720/1328 | B: 650/1398 | C: 620/1428
[LOSS Ex1] A: 0.62455 | B: 0.60933 | C: 0.60521
[LOGITS Ex2 A] Mean Abs: 2.412 | Max: 6.095
[LOSS Ex2] A: 0.10952 | B: 0.33346 | C: 0.23266
** [JOINT LOSS] ** : 0.838244
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011680 | Grad Max: 0.443724
  -> Layer: shared_layers.0.bias | Grad Mean: 1.144838 | Grad Max: 5.899863
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.005639
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002615 | Grad Max: 0.002615
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007602 | Grad Max: 1.180987
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.141186 | Grad Max: 6.609797
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000764 | Grad Max: 0.024371
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.074551 | Grad Max: 0.384120
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001044
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.014483 | Grad Max: 0.028879
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000757
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003496 | Grad Max: 0.010978
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001645 | Grad Max: 0.003695
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048780 | Grad Max: 0.048780
[GRADIENT NORM TOTAL] 25.8887

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.219
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51034486 0.48965514] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 723/1325 | B: 663/1385 | C: 623/1425
[LOSS Ex1] A: 0.62350 | B: 0.60481 | C: 0.60710
[LOGITS Ex2 A] Mean Abs: 2.358 | Max: 7.094
[LOSS Ex2] A: 0.11754 | B: 0.28419 | C: 0.23630
** [JOINT LOSS] ** : 0.824480
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010667 | Grad Max: 0.284262
  -> Layer: shared_layers.0.bias | Grad Mean: 0.683029 | Grad Max: 3.059067
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.005626
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002747 | Grad Max: 0.002747
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004705 | Grad Max: 0.584230
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086043 | Grad Max: 3.294141
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000500 | Grad Max: 0.015212
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047902 | Grad Max: 0.226146
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000746
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009545 | Grad Max: 0.019975
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000487
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002310 | Grad Max: 0.007365
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001084 | Grad Max: 0.002713
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031268 | Grad Max: 0.031268
[GRADIENT NORM TOTAL] 15.0644

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.010
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50061744 0.49938253] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.086
[MASKS] A(Pass/Fail): 690/1358 | B: 619/1237 | C: 635/1413
[LOSS Ex1] A: 0.63249 | B: 0.60886 | C: 0.60014
[LOGITS Ex2 A] Mean Abs: 2.264 | Max: 6.128
[LOSS Ex2] A: 0.09362 | B: 0.29582 | C: 0.19464
** [JOINT LOSS] ** : 0.808529
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003650 | Grad Max: 0.170498
  -> Layer: shared_layers.0.bias | Grad Mean: 0.499427 | Grad Max: 2.437197
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.005724
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003556 | Grad Max: 0.003556
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003387 | Grad Max: 0.639325
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062937 | Grad Max: 3.578642
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.011658
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035387 | Grad Max: 0.199403
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000464
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006668 | Grad Max: 0.014120
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000381
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001571 | Grad Max: 0.005670
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000740 | Grad Max: 0.002467
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021724 | Grad Max: 0.021724
[GRADIENT NORM TOTAL] 11.8035

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.904
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.542426   0.45757404] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.082
[MASKS] A(Pass/Fail): 693/1355 | B: 667/1381 | C: 590/1458
[LOSS Ex1] A: 0.63319 | B: 0.60886 | C: 0.60700
[LOGITS Ex2 A] Mean Abs: 2.230 | Max: 6.861
[LOSS Ex2] A: 0.10624 | B: 0.33184 | C: 0.21821
** [JOINT LOSS] ** : 0.835114
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009472 | Grad Max: 0.300976
  -> Layer: shared_layers.0.bias | Grad Mean: 0.868006 | Grad Max: 4.091258
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.005910
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011968 | Grad Max: 0.011968
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005751 | Grad Max: 0.834226
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107164 | Grad Max: 4.706403
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000606 | Grad Max: 0.020790
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059512 | Grad Max: 0.329571
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000801
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011487 | Grad Max: 0.022933
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000573
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002766 | Grad Max: 0.008339
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001314 | Grad Max: 0.003140
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039173 | Grad Max: 0.039173
[GRADIENT NORM TOTAL] 19.5581

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.134
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.819793   0.18020703] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.087
[MASKS] A(Pass/Fail): 751/1297 | B: 650/1398 | C: 654/1394
[LOSS Ex1] A: 0.62656 | B: 0.60920 | C: 0.59817
[LOGITS Ex2 A] Mean Abs: 2.289 | Max: 8.277
[LOSS Ex2] A: 0.08837 | B: 0.31896 | C: 0.20644
** [JOINT LOSS] ** : 0.815902
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004179 | Grad Max: 0.168744
  -> Layer: shared_layers.0.bias | Grad Mean: 0.466764 | Grad Max: 2.330323
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005863
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003579 | Grad Max: 0.003579
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003140 | Grad Max: 0.578967
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057940 | Grad Max: 3.229973
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000321 | Grad Max: 0.009933
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031829 | Grad Max: 0.161595
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000459
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006280 | Grad Max: 0.013336
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000349
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001524 | Grad Max: 0.004956
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000741 | Grad Max: 0.002387
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022034 | Grad Max: 0.022034
[GRADIENT NORM TOTAL] 10.7028

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.260
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004349  0.49956506] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.086
[MASKS] A(Pass/Fail): 724/1324 | B: 663/1385 | C: 643/1405
[LOSS Ex1] A: 0.63382 | B: 0.60469 | C: 0.60556
[LOGITS Ex2 A] Mean Abs: 2.368 | Max: 6.481
[LOSS Ex2] A: 0.09469 | B: 0.28366 | C: 0.21516
** [JOINT LOSS] ** : 0.812524
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004523 | Grad Max: 0.207978
  -> Layer: shared_layers.0.bias | Grad Mean: 0.486040 | Grad Max: 2.702191
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.005315
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000234 | Grad Max: 0.000234
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003086 | Grad Max: 0.685378
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057051 | Grad Max: 3.822773
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.009758
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029524 | Grad Max: 0.160769
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000389
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005454 | Grad Max: 0.011505
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000276
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001336 | Grad Max: 0.004004
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000561 | Grad Max: 0.001956
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017831 | Grad Max: 0.017831
[GRADIENT NORM TOTAL] 11.0947

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.889
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7456844  0.25431558] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.087
[MASKS] A(Pass/Fail): 714/1334 | B: 619/1237 | C: 653/1395
[LOSS Ex1] A: 0.62886 | B: 0.60874 | C: 0.59874
[LOGITS Ex2 A] Mean Abs: 2.352 | Max: 7.731
[LOSS Ex2] A: 0.10915 | B: 0.30079 | C: 0.23099
** [JOINT LOSS] ** : 0.825759
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005636 | Grad Max: 0.242662
  -> Layer: shared_layers.0.bias | Grad Mean: 0.661550 | Grad Max: 3.143151
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005601
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007476 | Grad Max: 0.007476
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004252 | Grad Max: 0.916355
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.079221 | Grad Max: 5.100037
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000449 | Grad Max: 0.014981
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044275 | Grad Max: 0.228076
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000570
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008459 | Grad Max: 0.017375
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000451
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002059 | Grad Max: 0.007052
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000924 | Grad Max: 0.002464
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028323 | Grad Max: 0.028323
[GRADIENT NORM TOTAL] 14.8431

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.031
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6437635  0.35623655] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.087
[MASKS] A(Pass/Fail): 590/1026 | B: 667/1381 | C: 610/1438
[LOSS Ex1] A: 0.62698 | B: 0.60875 | C: 0.60593
[LOGITS Ex2 A] Mean Abs: 2.372 | Max: 9.242
[LOSS Ex2] A: 0.09110 | B: 0.29982 | C: 0.20724
** [JOINT LOSS] ** : 0.813275
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003962 | Grad Max: 0.081903
  -> Layer: shared_layers.0.bias | Grad Mean: 0.248595 | Grad Max: 0.939856
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.005515
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003663 | Grad Max: 0.003663
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001748 | Grad Max: 0.472832
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032105 | Grad Max: 2.614952
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.007469
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017933 | Grad Max: 0.108190
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000317
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003436 | Grad Max: 0.008613
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000190
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000846 | Grad Max: 0.002540
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000413 | Grad Max: 0.001711
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012072 | Grad Max: 0.012072
[GRADIENT NORM TOTAL] 5.7169

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.261
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50816494 0.49183503] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 724/1324 | B: 650/1398 | C: 664/1384
[LOSS Ex1] A: 0.62773 | B: 0.60909 | C: 0.60271
[LOGITS Ex2 A] Mean Abs: 2.288 | Max: 9.400
[LOSS Ex2] A: 0.09338 | B: 0.31226 | C: 0.20576
** [JOINT LOSS] ** : 0.816980
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008207 | Grad Max: 0.271078
  -> Layer: shared_layers.0.bias | Grad Mean: 0.745104 | Grad Max: 3.581179
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005660
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002665 | Grad Max: 0.002665
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004624 | Grad Max: 0.638362
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086013 | Grad Max: 3.525337
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000489 | Grad Max: 0.016872
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048066 | Grad Max: 0.281649
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000621
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009275 | Grad Max: 0.018398
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000484
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002241 | Grad Max: 0.007361
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001060 | Grad Max: 0.003042
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031536 | Grad Max: 0.031536
[GRADIENT NORM TOTAL] 16.0311

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.187
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50558144 0.49441853] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 720/1328 | B: 663/1385 | C: 610/1438
[LOSS Ex1] A: 0.62438 | B: 0.60458 | C: 0.60602
[LOGITS Ex2 A] Mean Abs: 2.251 | Max: 5.944
[LOSS Ex2] A: 0.10959 | B: 0.30556 | C: 0.20347
** [JOINT LOSS] ** : 0.817869
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008310 | Grad Max: 0.312760
  -> Layer: shared_layers.0.bias | Grad Mean: 0.915854 | Grad Max: 4.217233
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005986
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000290 | Grad Max: 0.000290
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005703 | Grad Max: 0.719859
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.106510 | Grad Max: 3.991581
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.018721
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.060389 | Grad Max: 0.313958
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000828
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011608 | Grad Max: 0.023188
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000621
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002765 | Grad Max: 0.009101
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001281 | Grad Max: 0.003213
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038033 | Grad Max: 0.038033
[GRADIENT NORM TOTAL] 19.5138

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.223
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51042783 0.48957214] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 724/1324 | B: 619/1237 | C: 412/964
[LOSS Ex1] A: 0.62334 | B: 0.60863 | C: 0.61089
[LOGITS Ex2 A] Mean Abs: 2.247 | Max: 6.802
[LOSS Ex2] A: 0.11140 | B: 0.29676 | C: 0.22656
** [JOINT LOSS] ** : 0.825863
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004724 | Grad Max: 0.200558
  -> Layer: shared_layers.0.bias | Grad Mean: 0.509727 | Grad Max: 2.818214
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.006264
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003332 | Grad Max: 0.003332
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003042 | Grad Max: 0.470019
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.056255 | Grad Max: 2.597116
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.010626
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029964 | Grad Max: 0.174075
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000466
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005530 | Grad Max: 0.013126
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000329
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001356 | Grad Max: 0.004428
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000661 | Grad Max: 0.002134
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019738 | Grad Max: 0.019738
[GRADIENT NORM TOTAL] 10.9879

[EPOCH SUMMARY] Train Loss: 0.8206

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8016 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 169/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.013
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500606   0.49939394] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.086
[MASKS] A(Pass/Fail): 690/1358 | B: 667/1381 | C: 633/1415
[LOSS Ex1] A: 0.63234 | B: 0.60865 | C: 0.60774
[LOGITS Ex2 A] Mean Abs: 2.285 | Max: 7.788
[LOSS Ex2] A: 0.09681 | B: 0.30810 | C: 0.23356
** [JOINT LOSS] ** : 0.829062
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006319 | Grad Max: 0.154288
  -> Layer: shared_layers.0.bias | Grad Mean: 0.469085 | Grad Max: 2.047938
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.005496
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007104 | Grad Max: 0.007104
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003096 | Grad Max: 0.417115
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057015 | Grad Max: 2.329649
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000321 | Grad Max: 0.009252
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031076 | Grad Max: 0.153647
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000491
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006084 | Grad Max: 0.013210
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000337
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001433 | Grad Max: 0.004704
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000635 | Grad Max: 0.001934
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018797 | Grad Max: 0.018797
[GRADIENT NORM TOTAL] 9.9886

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.907
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54231215 0.45768788] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 693/1355 | B: 650/1398 | C: 620/1428
[LOSS Ex1] A: 0.63305 | B: 0.60900 | C: 0.60279
[LOGITS Ex2 A] Mean Abs: 2.280 | Max: 6.184
[LOSS Ex2] A: 0.10984 | B: 0.31058 | C: 0.20781
** [JOINT LOSS] ** : 0.824354
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007775 | Grad Max: 0.308092
  -> Layer: shared_layers.0.bias | Grad Mean: 0.818178 | Grad Max: 4.140939
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005503
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008216 | Grad Max: 0.008216
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005169 | Grad Max: 0.794147
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095910 | Grad Max: 4.435971
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000524 | Grad Max: 0.017047
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.051980 | Grad Max: 0.264520
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000735
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010091 | Grad Max: 0.021459
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000490
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002431 | Grad Max: 0.007647
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001081 | Grad Max: 0.003065
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033203 | Grad Max: 0.033203
[GRADIENT NORM TOTAL] 17.9708

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.137
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.82045096 0.17954908] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.087
[MASKS] A(Pass/Fail): 751/1297 | B: 663/1385 | C: 649/1399
[LOSS Ex1] A: 0.62642 | B: 0.60449 | C: 0.60412
[LOGITS Ex2 A] Mean Abs: 2.281 | Max: 9.082
[LOSS Ex2] A: 0.09225 | B: 0.27822 | C: 0.20209
** [JOINT LOSS] ** : 0.802530
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005375 | Grad Max: 0.161442
  -> Layer: shared_layers.0.bias | Grad Mean: 0.383604 | Grad Max: 1.780985
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.005932
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006105 | Grad Max: 0.006105
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002535 | Grad Max: 0.363011
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045837 | Grad Max: 2.001706
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.008834
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024259 | Grad Max: 0.136094
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000395
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004898 | Grad Max: 0.010537
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000267
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001239 | Grad Max: 0.003955
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000558 | Grad Max: 0.002203
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017120 | Grad Max: 0.017120
[GRADIENT NORM TOTAL] 8.3517

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.263
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004848  0.49951515] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.086
[MASKS] A(Pass/Fail): 725/1323 | B: 620/1236 | C: 669/1379
[LOSS Ex1] A: 0.63369 | B: 0.60853 | C: 0.59585
[LOGITS Ex2 A] Mean Abs: 2.252 | Max: 7.508
[LOSS Ex2] A: 0.09017 | B: 0.30575 | C: 0.19775
** [JOINT LOSS] ** : 0.810581
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008374 | Grad Max: 0.206446
  -> Layer: shared_layers.0.bias | Grad Mean: 0.671774 | Grad Max: 3.003193
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005675
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000186 | Grad Max: 0.000186
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004496 | Grad Max: 0.807122
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083886 | Grad Max: 4.490779
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.014930
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047003 | Grad Max: 0.244991
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000696
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009071 | Grad Max: 0.018650
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000510
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002132 | Grad Max: 0.007321
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001026 | Grad Max: 0.003031
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029172 | Grad Max: 0.029172
[GRADIENT NORM TOTAL] 14.7660

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.892
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7461759  0.25382408] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.087
[MASKS] A(Pass/Fail): 713/1335 | B: 667/1381 | C: 612/1436
[LOSS Ex1] A: 0.62872 | B: 0.60856 | C: 0.60853
[LOGITS Ex2 A] Mean Abs: 2.192 | Max: 7.045
[LOSS Ex2] A: 0.11413 | B: 0.33829 | C: 0.23157
** [JOINT LOSS] ** : 0.843266
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011787 | Grad Max: 0.324565
  -> Layer: shared_layers.0.bias | Grad Mean: 0.922320 | Grad Max: 4.321826
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.005464
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009236 | Grad Max: 0.009236
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006294 | Grad Max: 1.018728
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.117124 | Grad Max: 5.636506
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000653 | Grad Max: 0.022415
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063799 | Grad Max: 0.336156
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000871
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012513 | Grad Max: 0.024739
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000640
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003078 | Grad Max: 0.009554
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001536 | Grad Max: 0.003565
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044486 | Grad Max: 0.044486
[GRADIENT NORM TOTAL] 20.7903

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.034
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6439525 0.3560475] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.088
[MASKS] A(Pass/Fail): 590/1026 | B: 650/1398 | C: 632/1416
[LOSS Ex1] A: 0.62684 | B: 0.60891 | C: 0.60514
[LOGITS Ex2 A] Mean Abs: 2.261 | Max: 7.236
[LOSS Ex2] A: 0.09945 | B: 0.30863 | C: 0.23930
** [JOINT LOSS] ** : 0.829423
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007403 | Grad Max: 0.177557
  -> Layer: shared_layers.0.bias | Grad Mean: 0.546236 | Grad Max: 2.174809
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006427
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013609 | Grad Max: 0.013609
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003616 | Grad Max: 0.929129
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067045 | Grad Max: 5.160577
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000418 | Grad Max: 0.013327
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040483 | Grad Max: 0.223030
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000570
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007982 | Grad Max: 0.015324
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000419
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001991 | Grad Max: 0.005948
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001023 | Grad Max: 0.002813
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029583 | Grad Max: 0.029583
[GRADIENT NORM TOTAL] 12.3382

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.264
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50818455 0.49181545] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 724/1324 | B: 663/1385 | C: 631/1417
[LOSS Ex1] A: 0.62759 | B: 0.60441 | C: 0.60272
[LOGITS Ex2 A] Mean Abs: 2.324 | Max: 7.268
[LOSS Ex2] A: 0.08583 | B: 0.29242 | C: 0.21780
** [JOINT LOSS] ** : 0.810258
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005835 | Grad Max: 0.304969
  -> Layer: shared_layers.0.bias | Grad Mean: 0.705495 | Grad Max: 4.137354
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.005682
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001760 | Grad Max: 0.001760
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004375 | Grad Max: 0.687680
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.081342 | Grad Max: 3.801893
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000429 | Grad Max: 0.013369
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.043184 | Grad Max: 0.218275
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000578
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008313 | Grad Max: 0.017177
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000396
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002020 | Grad Max: 0.005733
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000831 | Grad Max: 0.002801
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027085 | Grad Max: 0.027085
[GRADIENT NORM TOTAL] 15.9979

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.189
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5054401  0.49455982] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 720/1328 | B: 620/1236 | C: 653/1395
[LOSS Ex1] A: 0.62425 | B: 0.60845 | C: 0.59974
[LOGITS Ex2 A] Mean Abs: 2.342 | Max: 8.112
[LOSS Ex2] A: 0.11209 | B: 0.31312 | C: 0.23254
** [JOINT LOSS] ** : 0.830065
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010043 | Grad Max: 0.403141
  -> Layer: shared_layers.0.bias | Grad Mean: 1.037346 | Grad Max: 5.367427
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.006371
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003261 | Grad Max: 0.003261
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006606 | Grad Max: 0.964795
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.123083 | Grad Max: 5.326637
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000684 | Grad Max: 0.023427
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.067982 | Grad Max: 0.378365
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.000865
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013199 | Grad Max: 0.026121
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000661
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003246 | Grad Max: 0.010234
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001473 | Grad Max: 0.003327
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044933 | Grad Max: 0.044933
[GRADIENT NORM TOTAL] 22.9372

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.225
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105302 0.4894698] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 723/1325 | B: 667/1381 | C: 637/1411
[LOSS Ex1] A: 0.62321 | B: 0.60849 | C: 0.60586
[LOGITS Ex2 A] Mean Abs: 2.295 | Max: 7.286
[LOSS Ex2] A: 0.10958 | B: 0.31550 | C: 0.23599
** [JOINT LOSS] ** : 0.832877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007754 | Grad Max: 0.289705
  -> Layer: shared_layers.0.bias | Grad Mean: 0.780934 | Grad Max: 4.050404
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.006061
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001791 | Grad Max: 0.001791
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005041 | Grad Max: 0.774583
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.093456 | Grad Max: 4.306075
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000532 | Grad Max: 0.017860
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.052942 | Grad Max: 0.284025
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000693
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010276 | Grad Max: 0.020700
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000481
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002417 | Grad Max: 0.007714
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001062 | Grad Max: 0.002810
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031970 | Grad Max: 0.031970
[GRADIENT NORM TOTAL] 17.4558

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.015
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006144  0.49938563] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.086
[MASKS] A(Pass/Fail): 689/1359 | B: 650/1398 | C: 631/1417
[LOSS Ex1] A: 0.63221 | B: 0.60884 | C: 0.59844
[LOGITS Ex2 A] Mean Abs: 2.221 | Max: 6.981
[LOSS Ex2] A: 0.09588 | B: 0.30451 | C: 0.20467
** [JOINT LOSS] ** : 0.814853
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002597 | Grad Max: 0.081854
  -> Layer: shared_layers.0.bias | Grad Mean: 0.244519 | Grad Max: 1.197058
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005711
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008681 | Grad Max: 0.008681
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001523 | Grad Max: 0.681634
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027652 | Grad Max: 3.788115
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.005542
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010730 | Grad Max: 0.074680
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000238
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002075 | Grad Max: 0.005545
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000158
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000475 | Grad Max: 0.002089
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001718
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007102 | Grad Max: 0.007102
[GRADIENT NORM TOTAL] 7.2126

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.909
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54224277 0.45775723] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 693/1355 | B: 663/1385 | C: 627/1421
[LOSS Ex1] A: 0.63293 | B: 0.60434 | C: 0.60359
[LOGITS Ex2 A] Mean Abs: 2.182 | Max: 6.290
[LOSS Ex2] A: 0.11644 | B: 0.29790 | C: 0.19456
** [JOINT LOSS] ** : 0.816586
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006606 | Grad Max: 0.217319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.587575 | Grad Max: 2.946973
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.005235
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003682 | Grad Max: 0.003682
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003935 | Grad Max: 0.664878
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073101 | Grad Max: 3.729229
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.012620
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037124 | Grad Max: 0.198991
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000538
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007207 | Grad Max: 0.015318
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000441
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001715 | Grad Max: 0.006271
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000776 | Grad Max: 0.002552
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023053 | Grad Max: 0.023053
[GRADIENT NORM TOTAL] 13.5601

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.140
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8208916 0.1791083] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.087
[MASKS] A(Pass/Fail): 750/1298 | B: 620/1236 | C: 622/1426
[LOSS Ex1] A: 0.62631 | B: 0.60838 | C: 0.60325
[LOGITS Ex2 A] Mean Abs: 2.243 | Max: 6.657
[LOSS Ex2] A: 0.08730 | B: 0.29662 | C: 0.21957
** [JOINT LOSS] ** : 0.813812
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003363 | Grad Max: 0.193802
  -> Layer: shared_layers.0.bias | Grad Mean: 0.475557 | Grad Max: 2.492187
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005193
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002084 | Grad Max: 0.002084
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003116 | Grad Max: 0.363835
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058099 | Grad Max: 2.053047
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000313 | Grad Max: 0.010879
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031400 | Grad Max: 0.167021
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000448
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006025 | Grad Max: 0.012564
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000372
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001443 | Grad Max: 0.005393
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000656 | Grad Max: 0.002311
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019855 | Grad Max: 0.019855
[GRADIENT NORM TOTAL] 10.7189

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.266
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005275 0.4994725] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.086
[MASKS] A(Pass/Fail): 725/1323 | B: 667/1381 | C: 639/1409
[LOSS Ex1] A: 0.63357 | B: 0.60842 | C: 0.60217
[LOGITS Ex2 A] Mean Abs: 2.292 | Max: 7.176
[LOSS Ex2] A: 0.09235 | B: 0.30931 | C: 0.21520
** [JOINT LOSS] ** : 0.820341
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004140 | Grad Max: 0.193104
  -> Layer: shared_layers.0.bias | Grad Mean: 0.366083 | Grad Max: 2.231292
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.006318
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005710 | Grad Max: 0.005710
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002296 | Grad Max: 0.537838
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042401 | Grad Max: 2.999084
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000236 | Grad Max: 0.009156
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023190 | Grad Max: 0.130984
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000362
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004529 | Grad Max: 0.009887
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000218
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001120 | Grad Max: 0.003387
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000453 | Grad Max: 0.001978
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014482 | Grad Max: 0.014482
[GRADIENT NORM TOTAL] 8.5070

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.894
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7463964  0.25360358] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.087
[MASKS] A(Pass/Fail): 713/1335 | B: 650/1398 | C: 412/964
[LOSS Ex1] A: 0.62860 | B: 0.60876 | C: 0.60860
[LOGITS Ex2 A] Mean Abs: 2.271 | Max: 6.104
[LOSS Ex2] A: 0.10731 | B: 0.31244 | C: 0.20876
** [JOINT LOSS] ** : 0.824825
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005685 | Grad Max: 0.232836
  -> Layer: shared_layers.0.bias | Grad Mean: 0.568788 | Grad Max: 3.028514
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.005115
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000046 | Grad Max: 0.000046
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003658 | Grad Max: 0.819113
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067476 | Grad Max: 4.576969
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000370 | Grad Max: 0.011554
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036482 | Grad Max: 0.188060
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000464
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007167 | Grad Max: 0.014685
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000375
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001772 | Grad Max: 0.005700
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000810 | Grad Max: 0.002472
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024804 | Grad Max: 0.024804
[GRADIENT NORM TOTAL] 13.1534

[EPOCH SUMMARY] Train Loss: 0.8216

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7990 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.8008 -> New: 0.7990)

############################## EPOCH 170/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.037
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6440113  0.35598868] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.088
[MASKS] A(Pass/Fail): 590/1026 | B: 663/1385 | C: 664/1384
[LOSS Ex1] A: 0.62672 | B: 0.60426 | C: 0.59545
[LOGITS Ex2 A] Mean Abs: 2.285 | Max: 12.082
[LOSS Ex2] A: 0.09960 | B: 0.28249 | C: 0.21549
** [JOINT LOSS] ** : 0.808004
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.060595
  -> Layer: shared_layers.0.bias | Grad Mean: 0.088958 | Grad Max: 0.386877
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006309
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009027 | Grad Max: 0.009027
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000716 | Grad Max: 0.349422
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012279 | Grad Max: 1.932204
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002985
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003685 | Grad Max: 0.036730
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000141
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000728 | Grad Max: 0.002988
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000208 | Grad Max: 0.001066
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000336 | Grad Max: 0.001388
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004581 | Grad Max: 0.004581
[GRADIENT NORM TOTAL] 3.2603

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.267
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082134  0.49178657] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 725/1323 | B: 620/1236 | C: 611/1437
[LOSS Ex1] A: 0.62748 | B: 0.60829 | C: 0.60536
[LOGITS Ex2 A] Mean Abs: 2.268 | Max: 10.262
[LOSS Ex2] A: 0.09512 | B: 0.28910 | C: 0.21295
** [JOINT LOSS] ** : 0.812767
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003111 | Grad Max: 0.112634
  -> Layer: shared_layers.0.bias | Grad Mean: 0.271123 | Grad Max: 1.133628
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.005424
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003920 | Grad Max: 0.003920
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001730 | Grad Max: 0.211193
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031696 | Grad Max: 1.165500
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.007121
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018263 | Grad Max: 0.111806
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000302
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003491 | Grad Max: 0.008170
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000194
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000813 | Grad Max: 0.002940
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000411 | Grad Max: 0.001830
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011048 | Grad Max: 0.011048
[GRADIENT NORM TOTAL] 5.6174

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.192
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5053226  0.49467742] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 720/1328 | B: 668/1380 | C: 629/1419
[LOSS Ex1] A: 0.62414 | B: 0.60832 | C: 0.61101
[LOGITS Ex2 A] Mean Abs: 2.286 | Max: 5.407
[LOSS Ex2] A: 0.09620 | B: 0.30875 | C: 0.23619
** [JOINT LOSS] ** : 0.828207
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003965 | Grad Max: 0.093667
  -> Layer: shared_layers.0.bias | Grad Mean: 0.272304 | Grad Max: 1.341745
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005930
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000242 | Grad Max: 0.000242
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001868 | Grad Max: 0.250564
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034308 | Grad Max: 1.410381
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.006546
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016175 | Grad Max: 0.096514
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000334
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003131 | Grad Max: 0.008215
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000204
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000707 | Grad Max: 0.002880
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000305 | Grad Max: 0.001392
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007908 | Grad Max: 0.007908
[GRADIENT NORM TOTAL] 6.3276

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.228
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5106332 0.4893668] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 723/1325 | B: 650/1398 | C: 623/1425
[LOSS Ex1] A: 0.62309 | B: 0.60866 | C: 0.60449
[LOGITS Ex2 A] Mean Abs: 2.236 | Max: 6.615
[LOSS Ex2] A: 0.10849 | B: 0.30609 | C: 0.20847
** [JOINT LOSS] ** : 0.819762
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003826 | Grad Max: 0.095459
  -> Layer: shared_layers.0.bias | Grad Mean: 0.273395 | Grad Max: 1.291165
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.006497
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001422 | Grad Max: 0.001422
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001907 | Grad Max: 0.310626
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034639 | Grad Max: 1.728630
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000171 | Grad Max: 0.005694
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016519 | Grad Max: 0.080058
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000329
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003275 | Grad Max: 0.008145
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000228
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000801 | Grad Max: 0.003120
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001550
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010056 | Grad Max: 0.010056
[GRADIENT NORM TOTAL] 6.5568

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.018
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50064445 0.49935552] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.086
[MASKS] A(Pass/Fail): 690/1358 | B: 663/1385 | C: 664/1384
[LOSS Ex1] A: 0.63207 | B: 0.60415 | C: 0.59864
[LOGITS Ex2 A] Mean Abs: 2.189 | Max: 6.019
[LOSS Ex2] A: 0.08846 | B: 0.28378 | C: 0.19041
** [JOINT LOSS] ** : 0.799171
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005138 | Grad Max: 0.152186
  -> Layer: shared_layers.0.bias | Grad Mean: 0.422047 | Grad Max: 2.079322
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.006306
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011225 | Grad Max: 0.011225
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002874 | Grad Max: 0.623386
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052891 | Grad Max: 3.495676
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.011409
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028616 | Grad Max: 0.173913
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000436
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005605 | Grad Max: 0.012103
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000357
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001334 | Grad Max: 0.004924
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000614 | Grad Max: 0.002520
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017820 | Grad Max: 0.017820
[GRADIENT NORM TOTAL] 10.0161

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.911
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.542105   0.45789495] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 693/1355 | B: 620/1236 | C: 653/1395
[LOSS Ex1] A: 0.63279 | B: 0.60819 | C: 0.59786
[LOGITS Ex2 A] Mean Abs: 2.167 | Max: 6.495
[LOSS Ex2] A: 0.10707 | B: 0.30163 | C: 0.22103
** [JOINT LOSS] ** : 0.822856
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007071 | Grad Max: 0.193284
  -> Layer: shared_layers.0.bias | Grad Mean: 0.476618 | Grad Max: 2.378479
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005183
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006809 | Grad Max: 0.006809
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003177 | Grad Max: 0.564853
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058820 | Grad Max: 3.168945
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000338 | Grad Max: 0.010087
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032789 | Grad Max: 0.153971
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000536
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006434 | Grad Max: 0.013527
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000366
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001540 | Grad Max: 0.005100
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000763 | Grad Max: 0.002390
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021331 | Grad Max: 0.021331
[GRADIENT NORM TOTAL] 10.5030

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.144
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8213929  0.17860706] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.088
[MASKS] A(Pass/Fail): 750/1298 | B: 668/1380 | C: 599/1449
[LOSS Ex1] A: 0.62617 | B: 0.60822 | C: 0.60238
[LOGITS Ex2 A] Mean Abs: 2.261 | Max: 6.971
[LOSS Ex2] A: 0.09830 | B: 0.30763 | C: 0.20528
** [JOINT LOSS] ** : 0.815991
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004241 | Grad Max: 0.136017
  -> Layer: shared_layers.0.bias | Grad Mean: 0.273247 | Grad Max: 1.266063
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.005887
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010002 | Grad Max: 0.010002
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.380881
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035662 | Grad Max: 2.115374
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.006317
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018435 | Grad Max: 0.099078
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000330
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003711 | Grad Max: 0.008046
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000221
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000961 | Grad Max: 0.003211
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000450 | Grad Max: 0.001779
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013103 | Grad Max: 0.013103
[GRADIENT NORM TOTAL] 6.6631

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.270
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50062567 0.49937436] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.086
[MASKS] A(Pass/Fail): 725/1323 | B: 650/1398 | C: 618/1430
[LOSS Ex1] A: 0.63342 | B: 0.60855 | C: 0.60436
[LOGITS Ex2 A] Mean Abs: 2.280 | Max: 6.655
[LOSS Ex2] A: 0.08870 | B: 0.30836 | C: 0.19570
** [JOINT LOSS] ** : 0.813036
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002927 | Grad Max: 0.149127
  -> Layer: shared_layers.0.bias | Grad Mean: 0.308411 | Grad Max: 2.007177
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005453
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001094 | Grad Max: 0.001094
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.325823
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040019 | Grad Max: 1.804268
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.007431
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018363 | Grad Max: 0.114990
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000269
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003329 | Grad Max: 0.007356
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000162
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.002239
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000477 | Grad Max: 0.001636
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010553 | Grad Max: 0.010553
[GRADIENT NORM TOTAL] 7.6352

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.896
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7468111  0.25318885] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.088
[MASKS] A(Pass/Fail): 713/1335 | B: 663/1385 | C: 661/1387
[LOSS Ex1] A: 0.62844 | B: 0.60404 | C: 0.59933
[LOGITS Ex2 A] Mean Abs: 2.225 | Max: 6.819
[LOSS Ex2] A: 0.10942 | B: 0.28749 | C: 0.19264
** [JOINT LOSS] ** : 0.807119
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003089 | Grad Max: 0.079800
  -> Layer: shared_layers.0.bias | Grad Mean: 0.215075 | Grad Max: 1.168709
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005484
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002074 | Grad Max: 0.002074
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001169 | Grad Max: 0.658572
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020621 | Grad Max: 3.657030
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.004709
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008391 | Grad Max: 0.068943
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000202
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001599 | Grad Max: 0.004620
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000137
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000348 | Grad Max: 0.001940
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001589
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004336 | Grad Max: 0.004336
[GRADIENT NORM TOTAL] 5.7597

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.040
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64418465 0.35581535] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.088
[MASKS] A(Pass/Fail): 590/1026 | B: 620/1236 | C: 635/1413
[LOSS Ex1] A: 0.62655 | B: 0.60807 | C: 0.60797
[LOGITS Ex2 A] Mean Abs: 2.264 | Max: 11.798
[LOSS Ex2] A: 0.09742 | B: 0.29243 | C: 0.20975
** [JOINT LOSS] ** : 0.814066
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003731 | Grad Max: 0.133300
  -> Layer: shared_layers.0.bias | Grad Mean: 0.395664 | Grad Max: 1.863499
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005522
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002207 | Grad Max: 0.002207
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.748567
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.043880 | Grad Max: 4.148497
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.007482
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024024 | Grad Max: 0.121546
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000413
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004639 | Grad Max: 0.010448
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000266
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001115 | Grad Max: 0.004005
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000538 | Grad Max: 0.002058
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015758 | Grad Max: 0.015758
[GRADIENT NORM TOTAL] 9.1656

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.272
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082613 0.4917387] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 725/1323 | B: 668/1380 | C: 634/1414
[LOSS Ex1] A: 0.62731 | B: 0.60811 | C: 0.60496
[LOGITS Ex2 A] Mean Abs: 2.292 | Max: 10.014
[LOSS Ex2] A: 0.08766 | B: 0.31266 | C: 0.22497
** [JOINT LOSS] ** : 0.821891
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002312 | Grad Max: 0.095213
  -> Layer: shared_layers.0.bias | Grad Mean: 0.150131 | Grad Max: 1.252135
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005258
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002382 | Grad Max: 0.002382
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001166 | Grad Max: 0.324423
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020922 | Grad Max: 1.806336
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.004164
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007942 | Grad Max: 0.062977
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000217
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001391 | Grad Max: 0.004855
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000120
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000314 | Grad Max: 0.001488
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001111
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003178 | Grad Max: 0.003178
[GRADIENT NORM TOTAL] 4.5573

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.196
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052098 0.4947902] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 720/1328 | B: 650/1398 | C: 636/1412
[LOSS Ex1] A: 0.62397 | B: 0.60843 | C: 0.60588
[LOGITS Ex2 A] Mean Abs: 2.296 | Max: 8.578
[LOSS Ex2] A: 0.10009 | B: 0.30331 | C: 0.21182
** [JOINT LOSS] ** : 0.817835
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003305 | Grad Max: 0.075037
  -> Layer: shared_layers.0.bias | Grad Mean: 0.210236 | Grad Max: 1.103128
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.006092
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000798 | Grad Max: 0.000798
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001311 | Grad Max: 0.339066
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023667 | Grad Max: 1.887164
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000098 | Grad Max: 0.004786
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009376 | Grad Max: 0.060975
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000234
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001847 | Grad Max: 0.005352
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000166
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000414 | Grad Max: 0.002190
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000330 | Grad Max: 0.001067
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004232 | Grad Max: 0.004232
[GRADIENT NORM TOTAL] 5.2590

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.233
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107318  0.48926815] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 723/1325 | B: 664/1384 | C: 630/1418
[LOSS Ex1] A: 0.62292 | B: 0.60391 | C: 0.60640
[LOGITS Ex2 A] Mean Abs: 2.244 | Max: 7.036
[LOSS Ex2] A: 0.10812 | B: 0.28411 | C: 0.21968
** [JOINT LOSS] ** : 0.815050
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003683 | Grad Max: 0.124169
  -> Layer: shared_layers.0.bias | Grad Mean: 0.206025 | Grad Max: 1.340376
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.005917
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007475 | Grad Max: 0.007475
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001388 | Grad Max: 0.191351
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023807 | Grad Max: 1.054265
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000134 | Grad Max: 0.006126
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012424 | Grad Max: 0.077958
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000218
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002150 | Grad Max: 0.005745
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000151
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000554 | Grad Max: 0.001922
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000325 | Grad Max: 0.001485
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.008181 | Grad Max: 0.008181
[GRADIENT NORM TOTAL] 4.3625

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.021
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006232  0.49937674] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.563 | Std: 0.086
[MASKS] A(Pass/Fail): 691/1357 | B: 620/1236 | C: 439/937
[LOSS Ex1] A: 0.63191 | B: 0.60795 | C: 0.59653
[LOGITS Ex2 A] Mean Abs: 2.245 | Max: 5.978
[LOSS Ex2] A: 0.08788 | B: 0.28156 | C: 0.19016
** [JOINT LOSS] ** : 0.798660
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002878 | Grad Max: 0.097087
  -> Layer: shared_layers.0.bias | Grad Mean: 0.125730 | Grad Max: 0.844214
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005677
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003209 | Grad Max: 0.003209
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000915 | Grad Max: 0.306265
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015470 | Grad Max: 1.707743
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003442
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002941 | Grad Max: 0.039180
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000142
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000492 | Grad Max: 0.002449
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000120
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000179 | Grad Max: 0.001491
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000503 | Grad Max: 0.001474
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002246 | Grad Max: 0.002246
[GRADIENT NORM TOTAL] 3.7375

[EPOCH SUMMARY] Train Loss: 0.8139

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7985 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.7990 -> New: 0.7985)

############################## EPOCH 171/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.914
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5421049  0.45789507] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 693/1355 | B: 668/1380 | C: 653/1395
[LOSS Ex1] A: 0.63263 | B: 0.60798 | C: 0.60139
[LOGITS Ex2 A] Mean Abs: 2.241 | Max: 6.678
[LOSS Ex2] A: 0.10236 | B: 0.30862 | C: 0.20202
** [JOINT LOSS] ** : 0.818335
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003188 | Grad Max: 0.103001
  -> Layer: shared_layers.0.bias | Grad Mean: 0.094367 | Grad Max: 0.364185
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005377
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003095 | Grad Max: 0.003095
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001005 | Grad Max: 0.453149
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017189 | Grad Max: 2.544601
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.004491
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005221 | Grad Max: 0.056732
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000160
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000846 | Grad Max: 0.003357
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000080
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000203 | Grad Max: 0.001007
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000338 | Grad Max: 0.001040
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002353 | Grad Max: 0.002353
[GRADIENT NORM TOTAL] 4.0281

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.149
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.82225156 0.17774843] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.088
[MASKS] A(Pass/Fail): 750/1298 | B: 650/1398 | C: 594/1454
[LOSS Ex1] A: 0.62599 | B: 0.60829 | C: 0.60481
[LOGITS Ex2 A] Mean Abs: 2.281 | Max: 7.284
[LOSS Ex2] A: 0.09363 | B: 0.29899 | C: 0.19865
** [JOINT LOSS] ** : 0.810119
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001829 | Grad Max: 0.047382
  -> Layer: shared_layers.0.bias | Grad Mean: 0.079020 | Grad Max: 0.675747
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005340
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002674 | Grad Max: 0.002674
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000667 | Grad Max: 0.173515
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.011493 | Grad Max: 0.967191
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002482
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002916 | Grad Max: 0.025657
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000162
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000601 | Grad Max: 0.002650
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000143 | Grad Max: 0.000769
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001219
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001860 | Grad Max: 0.001860
[GRADIENT NORM TOTAL] 2.4903

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.277
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006153 0.4993847] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.086
[MASKS] A(Pass/Fail): 725/1323 | B: 665/1383 | C: 639/1409
[LOSS Ex1] A: 0.63325 | B: 0.60375 | C: 0.60142
[LOGITS Ex2 A] Mean Abs: 2.319 | Max: 7.261
[LOSS Ex2] A: 0.08404 | B: 0.28121 | C: 0.20843
** [JOINT LOSS] ** : 0.804030
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002718 | Grad Max: 0.060754
  -> Layer: shared_layers.0.bias | Grad Mean: 0.138705 | Grad Max: 0.681695
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005223
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001963 | Grad Max: 0.001963
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000900 | Grad Max: 0.561943
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016127 | Grad Max: 3.116124
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.002040
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005227 | Grad Max: 0.028963
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000213
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000955 | Grad Max: 0.003617
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000099
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000217 | Grad Max: 0.001079
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000432 | Grad Max: 0.001130
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002059 | Grad Max: 0.002059
[GRADIENT NORM TOTAL] 4.6548

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.900
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.74756616 0.25243384] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.088
[MASKS] A(Pass/Fail): 713/1335 | B: 621/1235 | C: 631/1417
[LOSS Ex1] A: 0.62824 | B: 0.60777 | C: 0.60703
[LOGITS Ex2 A] Mean Abs: 2.286 | Max: 6.639
[LOSS Ex2] A: 0.10638 | B: 0.28649 | C: 0.20204
** [JOINT LOSS] ** : 0.812652
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003742 | Grad Max: 0.086888
  -> Layer: shared_layers.0.bias | Grad Mean: 0.223360 | Grad Max: 1.148190
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005298
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003145 | Grad Max: 0.003145
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001403 | Grad Max: 0.589452
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.024743 | Grad Max: 3.266058
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.004282
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.008784 | Grad Max: 0.051532
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000168
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001351 | Grad Max: 0.004459
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000111
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000299 | Grad Max: 0.001298
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.001170
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003465 | Grad Max: 0.003465
[GRADIENT NORM TOTAL] 5.8410

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.045
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6446194  0.35538054] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.088
[MASKS] A(Pass/Fail): 590/1026 | B: 669/1379 | C: 653/1395
[LOSS Ex1] A: 0.62634 | B: 0.60780 | C: 0.60351
[LOGITS Ex2 A] Mean Abs: 2.344 | Max: 10.711
[LOSS Ex2] A: 0.09524 | B: 0.29986 | C: 0.20623
** [JOINT LOSS] ** : 0.812994
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005853 | Grad Max: 0.144992
  -> Layer: shared_layers.0.bias | Grad Mean: 0.251193 | Grad Max: 1.667842
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.005910
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003488 | Grad Max: 0.003488
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001721 | Grad Max: 0.285859
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031063 | Grad Max: 1.589072
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.005875
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015981 | Grad Max: 0.075735
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000384
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003293 | Grad Max: 0.007997
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000195
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000771 | Grad Max: 0.002544
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000381 | Grad Max: 0.001634
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009795 | Grad Max: 0.009795
[GRADIENT NORM TOTAL] 5.6695

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.279
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50827724 0.49172276] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 727/1321 | B: 650/1398 | C: 639/1409
[LOSS Ex1] A: 0.62709 | B: 0.60811 | C: 0.60183
[LOGITS Ex2 A] Mean Abs: 2.311 | Max: 9.693
[LOSS Ex2] A: 0.09128 | B: 0.30001 | C: 0.20887
** [JOINT LOSS] ** : 0.812394
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002780 | Grad Max: 0.061787
  -> Layer: shared_layers.0.bias | Grad Mean: 0.165926 | Grad Max: 0.757587
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.005281
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000471 | Grad Max: 0.000471
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001329 | Grad Max: 0.177168
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023965 | Grad Max: 0.986357
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.005903
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.010364 | Grad Max: 0.068414
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000223
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001865 | Grad Max: 0.004912
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000447 | Grad Max: 0.001498
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000316 | Grad Max: 0.001372
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007083 | Grad Max: 0.007083
[GRADIENT NORM TOTAL] 4.2106

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.202
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5051736 0.4948264] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 720/1328 | B: 665/1383 | C: 633/1415
[LOSS Ex1] A: 0.62374 | B: 0.60357 | C: 0.60260
[LOGITS Ex2 A] Mean Abs: 2.314 | Max: 6.760
[LOSS Ex2] A: 0.09689 | B: 0.28694 | C: 0.22522
** [JOINT LOSS] ** : 0.812984
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.086695
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100160 | Grad Max: 0.510429
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006238
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005955 | Grad Max: 0.005955
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000753 | Grad Max: 0.210069
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013305 | Grad Max: 1.160851
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.002793
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004856 | Grad Max: 0.028721
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000158
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000934 | Grad Max: 0.003427
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000274 | Grad Max: 0.001438
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000236 | Grad Max: 0.001061
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004855 | Grad Max: 0.004855
[GRADIENT NORM TOTAL] 2.8968

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.239
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51083744 0.4891625 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 724/1324 | B: 621/1235 | C: 652/1396
[LOSS Ex1] A: 0.62267 | B: 0.60758 | C: 0.59931
[LOGITS Ex2 A] Mean Abs: 2.323 | Max: 6.978
[LOSS Ex2] A: 0.09701 | B: 0.28267 | C: 0.20271
** [JOINT LOSS] ** : 0.803986
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003497 | Grad Max: 0.174659
  -> Layer: shared_layers.0.bias | Grad Mean: 0.386331 | Grad Max: 2.338924
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002203 | Grad Max: 0.006178
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000901 | Grad Max: 0.000901
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002335 | Grad Max: 0.620205
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042604 | Grad Max: 3.467451
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000219 | Grad Max: 0.006529
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.021839 | Grad Max: 0.108675
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000350
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004237 | Grad Max: 0.009637
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000228
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001024 | Grad Max: 0.003292
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000438 | Grad Max: 0.001871
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013069 | Grad Max: 0.013069
[GRADIENT NORM TOTAL] 8.9340

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.026
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005842 0.4994158] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.087
[MASKS] A(Pass/Fail): 692/1356 | B: 669/1379 | C: 659/1389
[LOSS Ex1] A: 0.63167 | B: 0.60762 | C: 0.59924
[LOGITS Ex2 A] Mean Abs: 2.276 | Max: 7.005
[LOSS Ex2] A: 0.10108 | B: 0.30442 | C: 0.20411
** [JOINT LOSS] ** : 0.816045
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003175 | Grad Max: 0.093630
  -> Layer: shared_layers.0.bias | Grad Mean: 0.151761 | Grad Max: 0.712406
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005390
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005808 | Grad Max: 0.005808
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001122 | Grad Max: 0.424390
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019491 | Grad Max: 2.396086
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.003077
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004061 | Grad Max: 0.038441
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000113
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000657 | Grad Max: 0.003043
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000097
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000164 | Grad Max: 0.000989
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000315 | Grad Max: 0.000967
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001413 | Grad Max: 0.001413
[GRADIENT NORM TOTAL] 4.6399

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.919
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54206496 0.45793504] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.083
[MASKS] A(Pass/Fail): 693/1355 | B: 650/1398 | C: 653/1395
[LOSS Ex1] A: 0.63241 | B: 0.60791 | C: 0.60214
[LOGITS Ex2 A] Mean Abs: 2.270 | Max: 6.404
[LOSS Ex2] A: 0.10195 | B: 0.30255 | C: 0.21424
** [JOINT LOSS] ** : 0.820401
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006047 | Grad Max: 0.164568
  -> Layer: shared_layers.0.bias | Grad Mean: 0.209480 | Grad Max: 1.130645
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005638
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009551 | Grad Max: 0.009551
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001461 | Grad Max: 0.569800
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026175 | Grad Max: 3.184562
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.004643
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012366 | Grad Max: 0.060410
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000323
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002658 | Grad Max: 0.006433
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000185
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000671 | Grad Max: 0.002342
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000433 | Grad Max: 0.001930
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010903 | Grad Max: 0.010903
[GRADIENT NORM TOTAL] 5.7254

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.155
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8234019  0.17659804] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.088
[MASKS] A(Pass/Fail): 750/1298 | B: 665/1383 | C: 613/1435
[LOSS Ex1] A: 0.62575 | B: 0.60337 | C: 0.60665
[LOGITS Ex2 A] Mean Abs: 2.306 | Max: 6.980
[LOSS Ex2] A: 0.09148 | B: 0.27537 | C: 0.23455
** [JOINT LOSS] ** : 0.812388
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003343 | Grad Max: 0.088480
  -> Layer: shared_layers.0.bias | Grad Mean: 0.129182 | Grad Max: 0.785544
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.005540
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000509 | Grad Max: 0.000509
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000887 | Grad Max: 0.517749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015294 | Grad Max: 2.878430
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.001932
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002742 | Grad Max: 0.022657
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000151
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000519 | Grad Max: 0.003035
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000092
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000128 | Grad Max: 0.000994
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000307 | Grad Max: 0.001012
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000699 | Grad Max: 0.000699
[GRADIENT NORM TOTAL] 4.4812

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.283
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50060576 0.49939418] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 725/1323 | B: 623/1233 | C: 688/1360
[LOSS Ex1] A: 0.63302 | B: 0.60738 | C: 0.59785
[LOGITS Ex2 A] Mean Abs: 2.336 | Max: 5.854
[LOSS Ex2] A: 0.08158 | B: 0.27941 | C: 0.20448
** [JOINT LOSS] ** : 0.801240
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002416 | Grad Max: 0.068023
  -> Layer: shared_layers.0.bias | Grad Mean: 0.137366 | Grad Max: 0.802258
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.006178
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003084 | Grad Max: 0.003084
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000843 | Grad Max: 0.514485
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.014832 | Grad Max: 2.859023
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.002906
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003418 | Grad Max: 0.028509
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000137
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000537 | Grad Max: 0.002528
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000165 | Grad Max: 0.000854
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000404 | Grad Max: 0.001063
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001475 | Grad Max: 0.001475
[GRADIENT NORM TOTAL] 4.4045

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.905
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7484275 0.2515725] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.088
[MASKS] A(Pass/Fail): 713/1335 | B: 670/1378 | C: 640/1408
[LOSS Ex1] A: 0.62800 | B: 0.60743 | C: 0.60545
[LOGITS Ex2 A] Mean Abs: 2.313 | Max: 7.430
[LOSS Ex2] A: 0.11120 | B: 0.30203 | C: 0.21545
** [JOINT LOSS] ** : 0.823189
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002500 | Grad Max: 0.060090
  -> Layer: shared_layers.0.bias | Grad Mean: 0.100744 | Grad Max: 0.474059
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005588
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004879 | Grad Max: 0.004879
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000821 | Grad Max: 0.216559
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013818 | Grad Max: 1.196814
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002941
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002233 | Grad Max: 0.029770
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000125
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000382 | Grad Max: 0.002623
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000082
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000093 | Grad Max: 0.000777
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000250 | Grad Max: 0.000750
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000611 | Grad Max: 0.000611
[GRADIENT NORM TOTAL] 3.1258

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.051
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6450479  0.35495207] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.088
[MASKS] A(Pass/Fail): 590/1026 | B: 652/1396 | C: 410/966
[LOSS Ex1] A: 0.62609 | B: 0.60772 | C: 0.60208
[LOGITS Ex2 A] Mean Abs: 2.348 | Max: 8.980
[LOSS Ex2] A: 0.08850 | B: 0.30039 | C: 0.22049
** [JOINT LOSS] ** : 0.815092
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002715 | Grad Max: 0.086382
  -> Layer: shared_layers.0.bias | Grad Mean: 0.165364 | Grad Max: 1.143511
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.006745
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014752 | Grad Max: 0.014752
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001005 | Grad Max: 0.239619
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017524 | Grad Max: 1.310699
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.003006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002544 | Grad Max: 0.019629
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000150
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000406 | Grad Max: 0.002914
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000104 | Grad Max: 0.000843
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000328 | Grad Max: 0.001161
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001811 | Grad Max: 0.001811
[GRADIENT NORM TOTAL] 4.3260

[EPOCH SUMMARY] Train Loss: 0.8126

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7946 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.7985 -> New: 0.7946)

############################## EPOCH 172/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.285
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50825983 0.4917402 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 727/1321 | B: 666/1382 | C: 643/1405
[LOSS Ex1] A: 0.62683 | B: 0.60317 | C: 0.60505
[LOGITS Ex2 A] Mean Abs: 2.359 | Max: 10.241
[LOSS Ex2] A: 0.08326 | B: 0.28046 | C: 0.19356
** [JOINT LOSS] ** : 0.797444
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.088893
  -> Layer: shared_layers.0.bias | Grad Mean: 0.132837 | Grad Max: 0.776347
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.005513
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002712 | Grad Max: 0.002712
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000878 | Grad Max: 0.358079
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.015519 | Grad Max: 1.988845
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000037 | Grad Max: 0.002285
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002130 | Grad Max: 0.018823
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000117
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000372 | Grad Max: 0.002461
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000088
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000738
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000457 | Grad Max: 0.001277
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000424 | Grad Max: 0.000424
[GRADIENT NORM TOTAL] 3.8705

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.208
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50506747 0.49493256] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 720/1328 | B: 623/1233 | C: 607/1441
[LOSS Ex1] A: 0.62348 | B: 0.60717 | C: 0.60588
[LOGITS Ex2 A] Mean Abs: 2.361 | Max: 7.271
[LOSS Ex2] A: 0.09597 | B: 0.28248 | C: 0.20764
** [JOINT LOSS] ** : 0.807541
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.072332
  -> Layer: shared_layers.0.bias | Grad Mean: 0.177001 | Grad Max: 1.103562
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006528
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009480 | Grad Max: 0.009480
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001236 | Grad Max: 0.334792
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.022269 | Grad Max: 1.863441
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000071 | Grad Max: 0.005079
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.006693 | Grad Max: 0.059247
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001142 | Grad Max: 0.004087
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000108
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000225 | Grad Max: 0.001077
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.001192
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002820 | Grad Max: 0.002820
[GRADIENT NORM TOTAL] 4.7478

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.245
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51092565 0.48907435] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 724/1324 | B: 671/1377 | C: 654/1394
[LOSS Ex1] A: 0.62241 | B: 0.60722 | C: 0.60010
[LOGITS Ex2 A] Mean Abs: 2.366 | Max: 6.768
[LOSS Ex2] A: 0.10500 | B: 0.30383 | C: 0.21250
** [JOINT LOSS] ** : 0.817023
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002694 | Grad Max: 0.076627
  -> Layer: shared_layers.0.bias | Grad Mean: 0.145184 | Grad Max: 0.625474
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.006004
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001418 | Grad Max: 0.001418
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001326 | Grad Max: 0.435810
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023593 | Grad Max: 2.428090
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.006256
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011010 | Grad Max: 0.087691
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002204 | Grad Max: 0.005955
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000156
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000523 | Grad Max: 0.002047
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000262 | Grad Max: 0.001106
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005810 | Grad Max: 0.005810
[GRADIENT NORM TOTAL] 4.9582

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.031
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005005 0.4994995] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.087
[MASKS] A(Pass/Fail): 691/1357 | B: 654/1394 | C: 599/1449
[LOSS Ex1] A: 0.63142 | B: 0.60750 | C: 0.60673
[LOGITS Ex2 A] Mean Abs: 2.325 | Max: 6.023
[LOSS Ex2] A: 0.09777 | B: 0.29901 | C: 0.20989
** [JOINT LOSS] ** : 0.817436
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003783 | Grad Max: 0.121533
  -> Layer: shared_layers.0.bias | Grad Mean: 0.178976 | Grad Max: 0.934758
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.006137
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011988 | Grad Max: 0.011988
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001025 | Grad Max: 0.525363
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017347 | Grad Max: 2.939861
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003420
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004695 | Grad Max: 0.033286
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000173
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001085 | Grad Max: 0.003736
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000101
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000307 | Grad Max: 0.001141
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000301 | Grad Max: 0.001357
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006256 | Grad Max: 0.006256
[GRADIENT NORM TOTAL] 4.9799

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.924
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.542049   0.45795098] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.562 | Std: 0.084
[MASKS] A(Pass/Fail): 693/1355 | B: 667/1381 | C: 656/1392
[LOSS Ex1] A: 0.63216 | B: 0.60295 | C: 0.60070
[LOGITS Ex2 A] Mean Abs: 2.328 | Max: 6.183
[LOSS Ex2] A: 0.09784 | B: 0.27477 | C: 0.20605
** [JOINT LOSS] ** : 0.804823
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003550 | Grad Max: 0.124347
  -> Layer: shared_layers.0.bias | Grad Mean: 0.096788 | Grad Max: 0.339987
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.005286
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005579 | Grad Max: 0.005579
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000814 | Grad Max: 0.181765
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.013727 | Grad Max: 1.004215
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.002581
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.004232 | Grad Max: 0.027731
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000198
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000938 | Grad Max: 0.003886
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000090
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000227 | Grad Max: 0.000998
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000350 | Grad Max: 0.001299
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.003476 | Grad Max: 0.003476
[GRADIENT NORM TOTAL] 2.7889

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.161
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8246873 0.1753127] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.088
[MASKS] A(Pass/Fail): 750/1298 | B: 623/1233 | C: 626/1422
[LOSS Ex1] A: 0.62548 | B: 0.60693 | C: 0.60173
[LOGITS Ex2 A] Mean Abs: 2.400 | Max: 7.662
[LOSS Ex2] A: 0.09756 | B: 0.28958 | C: 0.20750
** [JOINT LOSS] ** : 0.809594
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007012 | Grad Max: 0.185278
  -> Layer: shared_layers.0.bias | Grad Mean: 0.372080 | Grad Max: 1.649750
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005707
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000468 | Grad Max: 0.000468
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002594 | Grad Max: 0.562041
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047005 | Grad Max: 3.137871
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000262 | Grad Max: 0.008213
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024968 | Grad Max: 0.127469
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000410
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005007 | Grad Max: 0.010876
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000268
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001224 | Grad Max: 0.003818
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000537 | Grad Max: 0.001970
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016304 | Grad Max: 0.016304
[GRADIENT NORM TOTAL] 8.6520

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50059986 0.4994001 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 725/1323 | B: 672/1376 | C: 684/1364
[LOSS Ex1] A: 0.63275 | B: 0.60700 | C: 0.59733
[LOGITS Ex2 A] Mean Abs: 2.380 | Max: 7.111
[LOSS Ex2] A: 0.08244 | B: 0.30697 | C: 0.20025
** [JOINT LOSS] ** : 0.808912
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004030 | Grad Max: 0.127049
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176357 | Grad Max: 0.770244
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005463
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001446 | Grad Max: 0.001446
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001336 | Grad Max: 0.661899
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023189 | Grad Max: 3.664516
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.003375
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003082 | Grad Max: 0.038339
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000145
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000422 | Grad Max: 0.002536
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000103 | Grad Max: 0.000501
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000321 | Grad Max: 0.000896
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001188 | Grad Max: 0.001188
[GRADIENT NORM TOTAL] 6.0130

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.910
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7495055 0.2504945] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.088
[MASKS] A(Pass/Fail): 713/1335 | B: 654/1394 | C: 609/1439
[LOSS Ex1] A: 0.62771 | B: 0.60728 | C: 0.60725
[LOGITS Ex2 A] Mean Abs: 2.350 | Max: 7.321
[LOSS Ex2] A: 0.10817 | B: 0.29889 | C: 0.22494
** [JOINT LOSS] ** : 0.824750
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003233 | Grad Max: 0.112172
  -> Layer: shared_layers.0.bias | Grad Mean: 0.210435 | Grad Max: 1.302880
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.005293
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005176 | Grad Max: 0.005176
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001550 | Grad Max: 0.265168
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027758 | Grad Max: 1.433430
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000141 | Grad Max: 0.005802
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013676 | Grad Max: 0.088074
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000253
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002409 | Grad Max: 0.006349
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000148
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000609 | Grad Max: 0.001794
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000311 | Grad Max: 0.001255
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009492 | Grad Max: 0.009492
[GRADIENT NORM TOTAL] 5.1380

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.058
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64569914 0.35430086] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 590/1026 | B: 667/1381 | C: 636/1412
[LOSS Ex1] A: 0.62580 | B: 0.60273 | C: 0.60147
[LOGITS Ex2 A] Mean Abs: 2.431 | Max: 9.096
[LOSS Ex2] A: 0.09439 | B: 0.28739 | C: 0.19880
** [JOINT LOSS] ** : 0.803523
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003688 | Grad Max: 0.116129
  -> Layer: shared_layers.0.bias | Grad Mean: 0.264394 | Grad Max: 1.495054
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002203 | Grad Max: 0.005888
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009414 | Grad Max: 0.009415
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001731 | Grad Max: 0.337491
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031398 | Grad Max: 1.885916
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004154
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009362 | Grad Max: 0.057009
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000216
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001873 | Grad Max: 0.005111
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000120
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000442 | Grad Max: 0.001455
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000316 | Grad Max: 0.001152
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005276 | Grad Max: 0.005277
[GRADIENT NORM TOTAL] 6.3547

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.293
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082816 0.4917184] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 727/1321 | B: 624/1232 | C: 656/1392
[LOSS Ex1] A: 0.62654 | B: 0.60672 | C: 0.59410
[LOGITS Ex2 A] Mean Abs: 2.407 | Max: 8.667
[LOSS Ex2] A: 0.07865 | B: 0.28832 | C: 0.18457
** [JOINT LOSS] ** : 0.792968
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002636 | Grad Max: 0.075436
  -> Layer: shared_layers.0.bias | Grad Mean: 0.193137 | Grad Max: 1.029533
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.005375
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003707 | Grad Max: 0.003707
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001330 | Grad Max: 0.472169
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023576 | Grad Max: 2.629771
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.004078
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005903 | Grad Max: 0.056526
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000209
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001223 | Grad Max: 0.004566
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000110
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000308 | Grad Max: 0.001401
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001189
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004613 | Grad Max: 0.004613
[GRADIENT NORM TOTAL] 5.4420

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.215
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050559  0.49494413] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 720/1328 | B: 672/1376 | C: 653/1395
[LOSS Ex1] A: 0.62318 | B: 0.60680 | C: 0.60623
[LOGITS Ex2 A] Mean Abs: 2.381 | Max: 8.088
[LOSS Ex2] A: 0.09362 | B: 0.30458 | C: 0.21357
** [JOINT LOSS] ** : 0.815995
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004453 | Grad Max: 0.150462
  -> Layer: shared_layers.0.bias | Grad Mean: 0.326038 | Grad Max: 1.927025
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.006085
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003306 | Grad Max: 0.003306
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002014 | Grad Max: 0.647694
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.036248 | Grad Max: 3.616336
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000165 | Grad Max: 0.005386
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016194 | Grad Max: 0.095264
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000329
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003112 | Grad Max: 0.006742
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000189
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000774 | Grad Max: 0.002602
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001553
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011968 | Grad Max: 0.011968
[GRADIENT NORM TOTAL] 7.8843

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.252
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5110809 0.4889191] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 724/1324 | B: 654/1394 | C: 682/1366
[LOSS Ex1] A: 0.62211 | B: 0.60708 | C: 0.59959
[LOGITS Ex2 A] Mean Abs: 2.384 | Max: 8.085
[LOSS Ex2] A: 0.09780 | B: 0.30012 | C: 0.20542
** [JOINT LOSS] ** : 0.810707
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002151 | Grad Max: 0.057097
  -> Layer: shared_layers.0.bias | Grad Mean: 0.145990 | Grad Max: 0.723725
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005871
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004687 | Grad Max: 0.004687
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001068 | Grad Max: 0.484310
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.019121 | Grad Max: 2.675334
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.003850
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003442 | Grad Max: 0.044321
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000157
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000575 | Grad Max: 0.003139
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000079
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000115 | Grad Max: 0.000704
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000262 | Grad Max: 0.000859
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000190 | Grad Max: 0.000190
[GRADIENT NORM TOTAL] 5.0623

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.037
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005007 0.4994993] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.087
[MASKS] A(Pass/Fail): 691/1357 | B: 667/1381 | C: 653/1395
[LOSS Ex1] A: 0.63112 | B: 0.60253 | C: 0.59821
[LOGITS Ex2 A] Mean Abs: 2.338 | Max: 6.049
[LOSS Ex2] A: 0.09320 | B: 0.26857 | C: 0.22304
** [JOINT LOSS] ** : 0.805557
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003390 | Grad Max: 0.114803
  -> Layer: shared_layers.0.bias | Grad Mean: 0.249121 | Grad Max: 1.499751
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005247
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000391 | Grad Max: 0.000391
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001714 | Grad Max: 0.217855
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.030513 | Grad Max: 1.210678
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.005468
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.014055 | Grad Max: 0.089696
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002455 | Grad Max: 0.005649
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000165
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000563 | Grad Max: 0.002173
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000352 | Grad Max: 0.001323
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006647 | Grad Max: 0.006647
[GRADIENT NORM TOTAL] 5.7072

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.930
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5420251 0.4579749] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.084
[MASKS] A(Pass/Fail): 693/1355 | B: 624/1232 | C: 431/945
[LOSS Ex1] A: 0.63188 | B: 0.60653 | C: 0.60182
[LOGITS Ex2 A] Mean Abs: 2.292 | Max: 6.716
[LOSS Ex2] A: 0.10084 | B: 0.28649 | C: 0.22106
** [JOINT LOSS] ** : 0.816211
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005758 | Grad Max: 0.148356
  -> Layer: shared_layers.0.bias | Grad Mean: 0.440257 | Grad Max: 1.900964
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.005629
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004373 | Grad Max: 0.004373
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002831 | Grad Max: 0.348480
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052028 | Grad Max: 1.933696
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.008635
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029759 | Grad Max: 0.153114
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000507
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005640 | Grad Max: 0.012961
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000320
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001278 | Grad Max: 0.004376
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000612 | Grad Max: 0.001868
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017045 | Grad Max: 0.017045
[GRADIENT NORM TOTAL] 9.2758

[EPOCH SUMMARY] Train Loss: 0.8095

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7921 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.7946 -> New: 0.7921)

############################## EPOCH 173/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.168
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8258951  0.17410488] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 750/1298 | B: 672/1376 | C: 661/1387
[LOSS Ex1] A: 0.62519 | B: 0.60662 | C: 0.59474
[LOGITS Ex2 A] Mean Abs: 2.388 | Max: 7.161
[LOSS Ex2] A: 0.09445 | B: 0.29393 | C: 0.19434
** [JOINT LOSS] ** : 0.803090
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004448 | Grad Max: 0.139613
  -> Layer: shared_layers.0.bias | Grad Mean: 0.152607 | Grad Max: 0.914248
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.006196
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005968 | Grad Max: 0.005968
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001225 | Grad Max: 0.203034
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020483 | Grad Max: 1.121511
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.002577
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003199 | Grad Max: 0.038052
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000178
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000416 | Grad Max: 0.002719
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000101 | Grad Max: 0.000827
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000342 | Grad Max: 0.001115
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001111 | Grad Max: 0.001111
[GRADIENT NORM TOTAL] 4.2865

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.298
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006913  0.49930874] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.087
[MASKS] A(Pass/Fail): 726/1322 | B: 654/1394 | C: 654/1394
[LOSS Ex1] A: 0.63249 | B: 0.60690 | C: 0.59669
[LOGITS Ex2 A] Mean Abs: 2.406 | Max: 7.352
[LOSS Ex2] A: 0.08458 | B: 0.29795 | C: 0.17639
** [JOINT LOSS] ** : 0.798331
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004389 | Grad Max: 0.157060
  -> Layer: shared_layers.0.bias | Grad Mean: 0.422676 | Grad Max: 2.129873
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005418
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006272 | Grad Max: 0.006272
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002795 | Grad Max: 0.536297
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051515 | Grad Max: 2.990288
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.008156
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025069 | Grad Max: 0.137210
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000371
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004846 | Grad Max: 0.011400
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000257
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001159 | Grad Max: 0.003870
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000503 | Grad Max: 0.002109
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015375 | Grad Max: 0.015375
[GRADIENT NORM TOTAL] 9.9766

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.915
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75030535 0.24969465] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.089
[MASKS] A(Pass/Fail): 713/1335 | B: 667/1381 | C: 628/1420
[LOSS Ex1] A: 0.62744 | B: 0.60236 | C: 0.60252
[LOGITS Ex2 A] Mean Abs: 2.357 | Max: 8.149
[LOSS Ex2] A: 0.10444 | B: 0.26986 | C: 0.20592
** [JOINT LOSS] ** : 0.804179
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.061710
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160829 | Grad Max: 0.504162
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.005679
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001291 | Grad Max: 0.001291
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001039 | Grad Max: 0.551152
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.018399 | Grad Max: 3.043483
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002550
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002594 | Grad Max: 0.027999
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000120
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000436 | Grad Max: 0.002496
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000097 | Grad Max: 0.000675
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000355 | Grad Max: 0.001160
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000073 | Grad Max: 0.000073
[GRADIENT NORM TOTAL] 5.1023

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.064
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64606684 0.35393313] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 590/1026 | B: 624/1232 | C: 682/1366
[LOSS Ex1] A: 0.62552 | B: 0.60635 | C: 0.59694
[LOGITS Ex2 A] Mean Abs: 2.407 | Max: 11.860
[LOSS Ex2] A: 0.09794 | B: 0.29249 | C: 0.20100
** [JOINT LOSS] ** : 0.806747
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005374 | Grad Max: 0.139659
  -> Layer: shared_layers.0.bias | Grad Mean: 0.348334 | Grad Max: 1.310389
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.005503
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006715 | Grad Max: 0.006715
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.288180
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041835 | Grad Max: 1.609957
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.008245
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024052 | Grad Max: 0.127463
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000378
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004742 | Grad Max: 0.010052
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000287
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001098 | Grad Max: 0.003591
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000572 | Grad Max: 0.001811
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015500 | Grad Max: 0.015500
[GRADIENT NORM TOTAL] 7.0208

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.300
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083333  0.49166664] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 728/1320 | B: 672/1376 | C: 688/1360
[LOSS Ex1] A: 0.62628 | B: 0.60645 | C: 0.59873
[LOGITS Ex2 A] Mean Abs: 2.414 | Max: 9.473
[LOSS Ex2] A: 0.07942 | B: 0.30209 | C: 0.22929
** [JOINT LOSS] ** : 0.814088
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003214 | Grad Max: 0.138558
  -> Layer: shared_layers.0.bias | Grad Mean: 0.238418 | Grad Max: 1.130593
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005881
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001118 | Grad Max: 0.001118
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001741 | Grad Max: 0.226426
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.031396 | Grad Max: 1.243055
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.006353
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015321 | Grad Max: 0.100120
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000258
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002691 | Grad Max: 0.006984
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000186
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000577 | Grad Max: 0.002620
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000269 | Grad Max: 0.001072
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.005503 | Grad Max: 0.005503
[GRADIENT NORM TOTAL] 5.6598

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.220
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5048422  0.49515775] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 721/1327 | B: 654/1394 | C: 610/1438
[LOSS Ex1] A: 0.62292 | B: 0.60673 | C: 0.61050
[LOGITS Ex2 A] Mean Abs: 2.399 | Max: 8.406
[LOSS Ex2] A: 0.10007 | B: 0.29686 | C: 0.21721
** [JOINT LOSS] ** : 0.818102
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002575 | Grad Max: 0.068170
  -> Layer: shared_layers.0.bias | Grad Mean: 0.120285 | Grad Max: 0.868813
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005692
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003775 | Grad Max: 0.003775
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000933 | Grad Max: 0.219625
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.016010 | Grad Max: 1.212247
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.002780
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005247 | Grad Max: 0.032508
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000180
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001074 | Grad Max: 0.004608
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000110
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000236 | Grad Max: 0.001081
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000243 | Grad Max: 0.000912
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002555 | Grad Max: 0.002555
[GRADIENT NORM TOTAL] 3.3135

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51124424 0.48875576] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 724/1324 | B: 667/1381 | C: 595/1453
[LOSS Ex1] A: 0.62185 | B: 0.60219 | C: 0.60831
[LOGITS Ex2 A] Mean Abs: 2.359 | Max: 6.335
[LOSS Ex2] A: 0.10041 | B: 0.28143 | C: 0.21474
** [JOINT LOSS] ** : 0.809644
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004228 | Grad Max: 0.178674
  -> Layer: shared_layers.0.bias | Grad Mean: 0.414080 | Grad Max: 2.277112
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005422
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000022 | Grad Max: 0.000022
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002792 | Grad Max: 0.594998
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.050532 | Grad Max: 3.279720
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000254 | Grad Max: 0.009365
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025597 | Grad Max: 0.147488
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000350
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004689 | Grad Max: 0.010157
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000282
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001121 | Grad Max: 0.003930
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000536 | Grad Max: 0.001809
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016120 | Grad Max: 0.016120
[GRADIENT NORM TOTAL] 9.9897

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.042
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004989 0.4995011] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.087
[MASKS] A(Pass/Fail): 691/1357 | B: 624/1232 | C: 633/1415
[LOSS Ex1] A: 0.63088 | B: 0.60618 | C: 0.59985
[LOGITS Ex2 A] Mean Abs: 2.352 | Max: 6.426
[LOSS Ex2] A: 0.09006 | B: 0.28683 | C: 0.19962
** [JOINT LOSS] ** : 0.804471
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002835 | Grad Max: 0.109447
  -> Layer: shared_layers.0.bias | Grad Mean: 0.272278 | Grad Max: 1.611253
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005420
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003329 | Grad Max: 0.003329
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.491988
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034067 | Grad Max: 2.759935
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000163 | Grad Max: 0.005257
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.016526 | Grad Max: 0.081773
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000330
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003127 | Grad Max: 0.007990
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000195
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000685 | Grad Max: 0.002958
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001652
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009253 | Grad Max: 0.009253
[GRADIENT NORM TOTAL] 6.8655

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.934
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5418613  0.45813867] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.084
[MASKS] A(Pass/Fail): 692/1356 | B: 673/1375 | C: 660/1388
[LOSS Ex1] A: 0.63165 | B: 0.60629 | C: 0.60310
[LOGITS Ex2 A] Mean Abs: 2.372 | Max: 6.515
[LOSS Ex2] A: 0.10106 | B: 0.30193 | C: 0.23335
** [JOINT LOSS] ** : 0.825793
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004287 | Grad Max: 0.224728
  -> Layer: shared_layers.0.bias | Grad Mean: 0.466636 | Grad Max: 2.553741
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005694
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000572 | Grad Max: 0.000572
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002889 | Grad Max: 0.409577
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053786 | Grad Max: 2.283252
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.010516
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030774 | Grad Max: 0.176105
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000429
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005618 | Grad Max: 0.013139
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000320
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001270 | Grad Max: 0.004826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000488 | Grad Max: 0.001660
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015575 | Grad Max: 0.015575
[GRADIENT NORM TOTAL] 10.0492

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.174
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.82687914 0.1731208 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 750/1298 | B: 654/1394 | C: 647/1401
[LOSS Ex1] A: 0.62497 | B: 0.60656 | C: 0.60326
[LOGITS Ex2 A] Mean Abs: 2.405 | Max: 9.016
[LOSS Ex2] A: 0.08698 | B: 0.29260 | C: 0.22652
** [JOINT LOSS] ** : 0.813629
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003399 | Grad Max: 0.124184
  -> Layer: shared_layers.0.bias | Grad Mean: 0.387512 | Grad Max: 1.615778
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.005461
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001370 | Grad Max: 0.001370
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002517 | Grad Max: 0.400374
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.046357 | Grad Max: 2.222160
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.009997
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024571 | Grad Max: 0.159562
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000320
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004505 | Grad Max: 0.009603
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000234
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001032 | Grad Max: 0.003586
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001622
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012861 | Grad Max: 0.012861
[GRADIENT NORM TOTAL] 8.8461

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.305
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50070107 0.49929893] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.088
[MASKS] A(Pass/Fail): 726/1322 | B: 667/1381 | C: 619/1429
[LOSS Ex1] A: 0.63227 | B: 0.60202 | C: 0.60081
[LOGITS Ex2 A] Mean Abs: 2.387 | Max: 7.196
[LOSS Ex2] A: 0.07957 | B: 0.27498 | C: 0.20567
** [JOINT LOSS] ** : 0.798440
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004516 | Grad Max: 0.159264
  -> Layer: shared_layers.0.bias | Grad Mean: 0.380336 | Grad Max: 1.987584
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005884
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008561 | Grad Max: 0.008561
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002322 | Grad Max: 0.279119
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.042070 | Grad Max: 1.558046
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.007632
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.022741 | Grad Max: 0.125606
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000387
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004473 | Grad Max: 0.010318
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000247
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001044 | Grad Max: 0.003625
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000505 | Grad Max: 0.001983
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014824 | Grad Max: 0.014824
[GRADIENT NORM TOTAL] 7.8873

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.919
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75103396 0.24896605] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.089
[MASKS] A(Pass/Fail): 713/1335 | B: 624/1232 | C: 675/1373
[LOSS Ex1] A: 0.62721 | B: 0.60601 | C: 0.59772
[LOGITS Ex2 A] Mean Abs: 2.357 | Max: 7.982
[LOSS Ex2] A: 0.10326 | B: 0.29276 | C: 0.19197
** [JOINT LOSS] ** : 0.806317
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005654 | Grad Max: 0.160190
  -> Layer: shared_layers.0.bias | Grad Mean: 0.461844 | Grad Max: 2.207402
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.005730
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002510 | Grad Max: 0.002510
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002726 | Grad Max: 0.642411
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.049560 | Grad Max: 3.565703
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000252 | Grad Max: 0.006581
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.025078 | Grad Max: 0.119957
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000433
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004791 | Grad Max: 0.010732
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000254
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001068 | Grad Max: 0.004003
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000496 | Grad Max: 0.001836
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014103 | Grad Max: 0.014103
[GRADIENT NORM TOTAL] 10.0062

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.069
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64633965 0.35366032] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 590/1026 | B: 673/1375 | C: 656/1392
[LOSS Ex1] A: 0.62531 | B: 0.60614 | C: 0.59688
[LOGITS Ex2 A] Mean Abs: 2.449 | Max: 9.995
[LOSS Ex2] A: 0.08980 | B: 0.30120 | C: 0.18938
** [JOINT LOSS] ** : 0.802903
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002776 | Grad Max: 0.098225
  -> Layer: shared_layers.0.bias | Grad Mean: 0.204681 | Grad Max: 1.158149
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006485
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013640 | Grad Max: 0.013640
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001500 | Grad Max: 0.324999
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026844 | Grad Max: 1.803301
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000119 | Grad Max: 0.006004
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011645 | Grad Max: 0.070321
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000259
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002212 | Grad Max: 0.006130
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000170
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000507 | Grad Max: 0.001999
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001203
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006012 | Grad Max: 0.006012
[GRADIENT NORM TOTAL] 5.2999

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.307
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083018  0.49169818] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 729/1319 | B: 654/1394 | C: 407/969
[LOSS Ex1] A: 0.62606 | B: 0.60641 | C: 0.61134
[LOGITS Ex2 A] Mean Abs: 2.413 | Max: 8.114
[LOSS Ex2] A: 0.08254 | B: 0.29790 | C: 0.20956
** [JOINT LOSS] ** : 0.811271
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005017 | Grad Max: 0.174433
  -> Layer: shared_layers.0.bias | Grad Mean: 0.387515 | Grad Max: 2.299068
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.005464
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000498 | Grad Max: 0.000498
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002575 | Grad Max: 0.523853
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.047191 | Grad Max: 2.926193
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.007033
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023502 | Grad Max: 0.117557
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000345
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004620 | Grad Max: 0.009617
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000227
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001098 | Grad Max: 0.003287
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000444 | Grad Max: 0.001964
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.013499 | Grad Max: 0.013499
[GRADIENT NORM TOTAL] 9.2888

[EPOCH SUMMARY] Train Loss: 0.8084

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7910 | Alpha: 0.5500
!!! BEST MODEL SAVED !!! (Old: 0.7921 -> New: 0.7910)

############################## EPOCH 174/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.225
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50467604 0.49532393] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 721/1327 | B: 667/1381 | C: 640/1408
[LOSS Ex1] A: 0.62271 | B: 0.60188 | C: 0.60506
[LOGITS Ex2 A] Mean Abs: 2.372 | Max: 6.969
[LOSS Ex2] A: 0.09637 | B: 0.27716 | C: 0.21620
** [JOINT LOSS] ** : 0.806458
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002434 | Grad Max: 0.082042
  -> Layer: shared_layers.0.bias | Grad Mean: 0.278686 | Grad Max: 1.141167
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.005604
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003997 | Grad Max: 0.003997
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001827 | Grad Max: 0.708885
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.033206 | Grad Max: 3.923262
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000165 | Grad Max: 0.006927
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017096 | Grad Max: 0.109724
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000273
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003157 | Grad Max: 0.008089
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000208
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000712 | Grad Max: 0.002567
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000337 | Grad Max: 0.001605
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009860 | Grad Max: 0.009860
[GRADIENT NORM TOTAL] 7.3684

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.263
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51130056 0.48869938] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 724/1324 | B: 624/1232 | C: 626/1422
[LOSS Ex1] A: 0.62164 | B: 0.60587 | C: 0.60500
[LOGITS Ex2 A] Mean Abs: 2.345 | Max: 6.300
[LOSS Ex2] A: 0.10542 | B: 0.28678 | C: 0.23278
** [JOINT LOSS] ** : 0.819162
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003229 | Grad Max: 0.083460
  -> Layer: shared_layers.0.bias | Grad Mean: 0.233306 | Grad Max: 0.890641
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.005564
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003691 | Grad Max: 0.003691
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001595 | Grad Max: 0.205006
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.028564 | Grad Max: 1.123028
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000160 | Grad Max: 0.005607
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015904 | Grad Max: 0.094219
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000239
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002890 | Grad Max: 0.006802
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000217
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000704 | Grad Max: 0.002979
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000342 | Grad Max: 0.001251
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.010904 | Grad Max: 0.010904
[GRADIENT NORM TOTAL] 5.0205

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.046
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004218  0.49957815] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.088
[MASKS] A(Pass/Fail): 691/1357 | B: 673/1375 | C: 662/1386
[LOSS Ex1] A: 0.63067 | B: 0.60600 | C: 0.59760
[LOGITS Ex2 A] Mean Abs: 2.370 | Max: 6.222
[LOSS Ex2] A: 0.09067 | B: 0.29836 | C: 0.21602
** [JOINT LOSS] ** : 0.813110
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005303 | Grad Max: 0.234053
  -> Layer: shared_layers.0.bias | Grad Mean: 0.544093 | Grad Max: 2.706027
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.005312
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003653 | Grad Max: 0.003653
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003497 | Grad Max: 0.511352
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065371 | Grad Max: 2.826324
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.013655
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037445 | Grad Max: 0.214354
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000460
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007003 | Grad Max: 0.014054
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000344
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001648 | Grad Max: 0.005145
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000670 | Grad Max: 0.002330
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.021231 | Grad Max: 0.021231
[GRADIENT NORM TOTAL] 11.9644

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.937
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5417878  0.45821217] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.084
[MASKS] A(Pass/Fail): 692/1356 | B: 654/1394 | C: 643/1405
[LOSS Ex1] A: 0.63145 | B: 0.60627 | C: 0.60212
[LOGITS Ex2 A] Mean Abs: 2.358 | Max: 6.586
[LOSS Ex2] A: 0.09708 | B: 0.29379 | C: 0.20501
** [JOINT LOSS] ** : 0.811908
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004304 | Grad Max: 0.178303
  -> Layer: shared_layers.0.bias | Grad Mean: 0.441285 | Grad Max: 2.402687
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002037 | Grad Max: 0.005524
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002884 | Grad Max: 0.002884
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002975 | Grad Max: 0.445887
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054982 | Grad Max: 2.461746
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.011856
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031080 | Grad Max: 0.181284
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000396
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005838 | Grad Max: 0.012074
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000276
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001404 | Grad Max: 0.004420
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.002183
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018658 | Grad Max: 0.018658
[GRADIENT NORM TOTAL] 10.0137

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.179
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.82776946 0.17223054] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 750/1298 | B: 669/1379 | C: 645/1403
[LOSS Ex1] A: 0.62476 | B: 0.60174 | C: 0.60144
[LOGITS Ex2 A] Mean Abs: 2.364 | Max: 9.482
[LOSS Ex2] A: 0.08808 | B: 0.27486 | C: 0.20181
** [JOINT LOSS] ** : 0.797563
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003558 | Grad Max: 0.154165
  -> Layer: shared_layers.0.bias | Grad Mean: 0.346718 | Grad Max: 2.100041
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.005277
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002500 | Grad Max: 0.002500
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002253 | Grad Max: 0.542258
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041421 | Grad Max: 3.020621
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.007901
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020297 | Grad Max: 0.116531
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000356
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003889 | Grad Max: 0.008700
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000227
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000898 | Grad Max: 0.003631
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000411 | Grad Max: 0.001863
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011899 | Grad Max: 0.011899
[GRADIENT NORM TOTAL] 8.6662

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.311
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007146  0.49928543] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.088
[MASKS] A(Pass/Fail): 726/1322 | B: 624/1232 | C: 682/1366
[LOSS Ex1] A: 0.63207 | B: 0.60573 | C: 0.59740
[LOGITS Ex2 A] Mean Abs: 2.362 | Max: 6.954
[LOSS Ex2] A: 0.08569 | B: 0.29429 | C: 0.20382
** [JOINT LOSS] ** : 0.806336
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004936 | Grad Max: 0.197630
  -> Layer: shared_layers.0.bias | Grad Mean: 0.466388 | Grad Max: 2.695253
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.005437
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000578 | Grad Max: 0.000578
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003033 | Grad Max: 0.595597
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.055163 | Grad Max: 3.350343
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.008224
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.029056 | Grad Max: 0.144630
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000436
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005638 | Grad Max: 0.011885
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000334
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001311 | Grad Max: 0.005143
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000593 | Grad Max: 0.002007
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017711 | Grad Max: 0.017711
[GRADIENT NORM TOTAL] 10.7980

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 0.923
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.751702   0.24829794] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.089
[MASKS] A(Pass/Fail): 713/1335 | B: 673/1375 | C: 666/1382
[LOSS Ex1] A: 0.62701 | B: 0.60588 | C: 0.60085
[LOGITS Ex2 A] Mean Abs: 2.347 | Max: 6.592
[LOSS Ex2] A: 0.10130 | B: 0.29487 | C: 0.20836
** [JOINT LOSS] ** : 0.812756
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003995 | Grad Max: 0.113783
  -> Layer: shared_layers.0.bias | Grad Mean: 0.149579 | Grad Max: 0.835968
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002021 | Grad Max: 0.005851
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004469 | Grad Max: 0.004469
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001037 | Grad Max: 0.479124
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017139 | Grad Max: 2.666783
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002858
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.002669 | Grad Max: 0.026688
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000174
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000395 | Grad Max: 0.002859
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000096
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.001039
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.001079
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.000497 | Grad Max: 0.000497
[GRADIENT NORM TOTAL] 4.6047

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.074
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64667594 0.35332406] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 590/1026 | B: 655/1393 | C: 656/1392
[LOSS Ex1] A: 0.62510 | B: 0.60614 | C: 0.59902
[LOGITS Ex2 A] Mean Abs: 2.417 | Max: 9.562
[LOSS Ex2] A: 0.09288 | B: 0.30240 | C: 0.21965
** [JOINT LOSS] ** : 0.815066
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004642 | Grad Max: 0.233178
  -> Layer: shared_layers.0.bias | Grad Mean: 0.555941 | Grad Max: 3.037862
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.005479
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005678 | Grad Max: 0.005678
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003563 | Grad Max: 0.644318
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066183 | Grad Max: 3.560100
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.011111
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033910 | Grad Max: 0.176565
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000499
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006419 | Grad Max: 0.014353
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000321
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001497 | Grad Max: 0.005059
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000580 | Grad Max: 0.002015
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018931 | Grad Max: 0.018931
[GRADIENT NORM TOTAL] 12.8964

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.312
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083137  0.49168622] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 729/1319 | B: 670/1378 | C: 652/1396
[LOSS Ex1] A: 0.62586 | B: 0.60161 | C: 0.60149
[LOGITS Ex2 A] Mean Abs: 2.389 | Max: 10.382
[LOSS Ex2] A: 0.08314 | B: 0.27545 | C: 0.19947
** [JOINT LOSS] ** : 0.795673
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004174 | Grad Max: 0.178013
  -> Layer: shared_layers.0.bias | Grad Mean: 0.350344 | Grad Max: 2.236244
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005578
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000825 | Grad Max: 0.000825
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.474007
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041152 | Grad Max: 2.632708
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000172 | Grad Max: 0.005502
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.017662 | Grad Max: 0.086431
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000289
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003561 | Grad Max: 0.007923
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000242
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000903 | Grad Max: 0.003356
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000460 | Grad Max: 0.001724
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011630 | Grad Max: 0.011630
[GRADIENT NORM TOTAL] 8.5741

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.230
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045797  0.49542025] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 721/1327 | B: 625/1231 | C: 613/1435
[LOSS Ex1] A: 0.62251 | B: 0.60560 | C: 0.60563
[LOGITS Ex2 A] Mean Abs: 2.330 | Max: 7.217
[LOSS Ex2] A: 0.09691 | B: 0.29461 | C: 0.19642
** [JOINT LOSS] ** : 0.807227
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004537 | Grad Max: 0.143025
  -> Layer: shared_layers.0.bias | Grad Mean: 0.456747 | Grad Max: 2.066792
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006129
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003349 | Grad Max: 0.003349
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002837 | Grad Max: 0.338709
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.052671 | Grad Max: 1.897464
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000308 | Grad Max: 0.010592
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031361 | Grad Max: 0.169184
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000485
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005685 | Grad Max: 0.012963
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000313
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001254 | Grad Max: 0.004741
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000539 | Grad Max: 0.002037
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.015977 | Grad Max: 0.015977
[GRADIENT NORM TOTAL] 9.5148

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.267
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5113761  0.48862392] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 725/1323 | B: 673/1375 | C: 631/1417
[LOSS Ex1] A: 0.62145 | B: 0.60576 | C: 0.60106
[LOGITS Ex2 A] Mean Abs: 2.332 | Max: 6.951
[LOSS Ex2] A: 0.10004 | B: 0.29175 | C: 0.20735
** [JOINT LOSS] ** : 0.809137
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003689 | Grad Max: 0.147577
  -> Layer: shared_layers.0.bias | Grad Mean: 0.342668 | Grad Max: 1.833389
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005769
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008496 | Grad Max: 0.008496
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002229 | Grad Max: 0.252251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.040129 | Grad Max: 1.398212
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000232 | Grad Max: 0.008356
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023496 | Grad Max: 0.123836
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000380
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004265 | Grad Max: 0.011227
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001013 | Grad Max: 0.003429
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000469 | Grad Max: 0.001773
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014706 | Grad Max: 0.014706
[GRADIENT NORM TOTAL] 7.2806

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.049
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003877 0.4996123] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.088
[MASKS] A(Pass/Fail): 691/1357 | B: 656/1392 | C: 662/1386
[LOSS Ex1] A: 0.63049 | B: 0.60602 | C: 0.60145
[LOGITS Ex2 A] Mean Abs: 2.359 | Max: 6.432
[LOSS Ex2] A: 0.09392 | B: 0.30380 | C: 0.20647
** [JOINT LOSS] ** : 0.814046
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007464 | Grad Max: 0.242551
  -> Layer: shared_layers.0.bias | Grad Mean: 0.675581 | Grad Max: 3.328815
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.005196
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006622 | Grad Max: 0.006622
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004294 | Grad Max: 0.732520
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.079103 | Grad Max: 4.099559
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000387 | Grad Max: 0.013265
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039462 | Grad Max: 0.234707
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000603
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007615 | Grad Max: 0.017632
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000360
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001780 | Grad Max: 0.005444
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000771 | Grad Max: 0.002495
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023921 | Grad Max: 0.023921
[GRADIENT NORM TOTAL] 15.2895

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.940
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5417358  0.45826414] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.084
[MASKS] A(Pass/Fail): 692/1356 | B: 670/1378 | C: 661/1387
[LOSS Ex1] A: 0.63128 | B: 0.60149 | C: 0.59692
[LOGITS Ex2 A] Mean Abs: 2.359 | Max: 6.492
[LOSS Ex2] A: 0.10122 | B: 0.28411 | C: 0.20673
** [JOINT LOSS] ** : 0.807251
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006528 | Grad Max: 0.296593
  -> Layer: shared_layers.0.bias | Grad Mean: 0.740089 | Grad Max: 3.788008
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005183
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006553 | Grad Max: 0.006553
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004550 | Grad Max: 0.834221
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.084174 | Grad Max: 4.619366
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000444 | Grad Max: 0.015686
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.046232 | Grad Max: 0.250357
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000577
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008789 | Grad Max: 0.018217
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000414
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002106 | Grad Max: 0.006701
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000840 | Grad Max: 0.002555
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027808 | Grad Max: 0.027808
[GRADIENT NORM TOTAL] 16.3541

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.183
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.82852674 0.17147328] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 749/1299 | B: 625/1231 | C: 471/905
[LOSS Ex1] A: 0.62459 | B: 0.60549 | C: 0.59400
[LOGITS Ex2 A] Mean Abs: 2.371 | Max: 7.811
[LOSS Ex2] A: 0.08739 | B: 0.28163 | C: 0.19147
** [JOINT LOSS] ** : 0.794856
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003570 | Grad Max: 0.093626
  -> Layer: shared_layers.0.bias | Grad Mean: 0.197482 | Grad Max: 0.787393
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005780
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002544 | Grad Max: 0.002544
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001468 | Grad Max: 0.418029
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026875 | Grad Max: 2.309893
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.005131
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.012285 | Grad Max: 0.079053
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000255
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002307 | Grad Max: 0.006255
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000174
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000549 | Grad Max: 0.002016
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001472
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007047 | Grad Max: 0.007047
[GRADIENT NORM TOTAL] 5.4298

[EPOCH SUMMARY] Train Loss: 0.8079

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8018 | Alpha: 0.5500
No improve count: 1/15

############################## EPOCH 175/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.315
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007629 0.4992371] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.088
[MASKS] A(Pass/Fail): 726/1322 | B: 673/1375 | C: 667/1381
[LOSS Ex1] A: 0.63191 | B: 0.60565 | C: 0.59735
[LOGITS Ex2 A] Mean Abs: 2.307 | Max: 7.475
[LOSS Ex2] A: 0.09323 | B: 0.31544 | C: 0.17952
** [JOINT LOSS] ** : 0.807700
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009838 | Grad Max: 0.304862
  -> Layer: shared_layers.0.bias | Grad Mean: 0.853097 | Grad Max: 4.089119
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.005454
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000121 | Grad Max: 0.000121
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005663 | Grad Max: 1.105497
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.104019 | Grad Max: 6.114343
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000545 | Grad Max: 0.015991
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.055722 | Grad Max: 0.276606
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000811
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010920 | Grad Max: 0.022201
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000553
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002555 | Grad Max: 0.008574
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001114 | Grad Max: 0.003386
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033724 | Grad Max: 0.033724
[GRADIENT NORM TOTAL] 19.4519

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 0.926
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7522561 0.2477439] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.089
[MASKS] A(Pass/Fail): 713/1335 | B: 656/1392 | C: 628/1420
[LOSS Ex1] A: 0.62684 | B: 0.60592 | C: 0.60216
[LOGITS Ex2 A] Mean Abs: 2.279 | Max: 7.330
[LOSS Ex2] A: 0.11484 | B: 0.31985 | C: 0.20920
** [JOINT LOSS] ** : 0.826265
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.013621 | Grad Max: 0.348307
  -> Layer: shared_layers.0.bias | Grad Mean: 1.045884 | Grad Max: 4.595587
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005029
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005893 | Grad Max: 0.005893
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006829 | Grad Max: 0.947800
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.126474 | Grad Max: 5.329916
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000698 | Grad Max: 0.019894
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.070692 | Grad Max: 0.334431
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000980
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013718 | Grad Max: 0.027364
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000760
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003244 | Grad Max: 0.010822
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001470 | Grad Max: 0.003497
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044110 | Grad Max: 0.044110
[GRADIENT NORM TOTAL] 22.3725

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.077
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6469529 0.3530471] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 590/1026 | B: 672/1376 | C: 654/1394
[LOSS Ex1] A: 0.62493 | B: 0.60139 | C: 0.60263
[LOGITS Ex2 A] Mean Abs: 2.363 | Max: 10.313
[LOSS Ex2] A: 0.09980 | B: 0.28422 | C: 0.20562
** [JOINT LOSS] ** : 0.806199
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007438 | Grad Max: 0.222790
  -> Layer: shared_layers.0.bias | Grad Mean: 0.456270 | Grad Max: 2.018845
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005484
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002813 | Grad Max: 0.002813
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003185 | Grad Max: 0.734236
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057863 | Grad Max: 4.106436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.008420
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030703 | Grad Max: 0.146659
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000590
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006191 | Grad Max: 0.014592
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000341
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001476 | Grad Max: 0.005177
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000724 | Grad Max: 0.002389
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020965 | Grad Max: 0.020965
[GRADIENT NORM TOTAL] 10.9506

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.316
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.508341   0.49165902] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 728/1320 | B: 625/1231 | C: 653/1395
[LOSS Ex1] A: 0.62570 | B: 0.60538 | C: 0.59614
[LOGITS Ex2 A] Mean Abs: 2.395 | Max: 10.314
[LOSS Ex2] A: 0.08836 | B: 0.28792 | C: 0.19868
** [JOINT LOSS] ** : 0.800728
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004308 | Grad Max: 0.217466
  -> Layer: shared_layers.0.bias | Grad Mean: 0.575676 | Grad Max: 2.885713
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005403
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000993 | Grad Max: 0.000993
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003633 | Grad Max: 0.848790
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067764 | Grad Max: 4.728732
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.012350
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036364 | Grad Max: 0.204932
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000464
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006786 | Grad Max: 0.013748
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000330
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001645 | Grad Max: 0.005120
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000686 | Grad Max: 0.002456
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022045 | Grad Max: 0.022045
[GRADIENT NORM TOTAL] 13.3760

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.233
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50447804 0.49552193] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 721/1327 | B: 673/1375 | C: 708/1340
[LOSS Ex1] A: 0.62235 | B: 0.60555 | C: 0.59349
[LOGITS Ex2 A] Mean Abs: 2.415 | Max: 6.994
[LOSS Ex2] A: 0.09778 | B: 0.32269 | C: 0.21669
** [JOINT LOSS] ** : 0.819519
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009976 | Grad Max: 0.448461
  -> Layer: shared_layers.0.bias | Grad Mean: 1.145258 | Grad Max: 6.090080
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006600
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001861 | Grad Max: 0.001861
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007069 | Grad Max: 1.485500
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.132417 | Grad Max: 8.285478
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000707 | Grad Max: 0.022706
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.073498 | Grad Max: 0.389052
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.000892
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013899 | Grad Max: 0.027694
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000698
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003268 | Grad Max: 0.010230
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.003592
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043008 | Grad Max: 0.043008
[GRADIENT NORM TOTAL] 25.6328

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.271
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51150876 0.48849118] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 725/1323 | B: 658/1390 | C: 647/1401
[LOSS Ex1] A: 0.62129 | B: 0.60582 | C: 0.59904
[LOGITS Ex2 A] Mean Abs: 2.382 | Max: 6.905
[LOSS Ex2] A: 0.10617 | B: 0.31413 | C: 0.20649
** [JOINT LOSS] ** : 0.817641
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007816 | Grad Max: 0.298697
  -> Layer: shared_layers.0.bias | Grad Mean: 0.862895 | Grad Max: 4.091724
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.005778
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002669 | Grad Max: 0.002669
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005638 | Grad Max: 1.173109
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.104786 | Grad Max: 6.538060
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000556 | Grad Max: 0.017495
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057382 | Grad Max: 0.278860
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000771
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010912 | Grad Max: 0.023293
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000515
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002558 | Grad Max: 0.007922
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001059 | Grad Max: 0.002937
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033541 | Grad Max: 0.033541
[GRADIENT NORM TOTAL] 19.5200

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.052
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004113  0.49958876] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.088
[MASKS] A(Pass/Fail): 691/1357 | B: 672/1376 | C: 669/1379
[LOSS Ex1] A: 0.63034 | B: 0.60130 | C: 0.60005
[LOGITS Ex2 A] Mean Abs: 2.289 | Max: 5.849
[LOSS Ex2] A: 0.09095 | B: 0.27320 | C: 0.21106
** [JOINT LOSS] ** : 0.802298
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002876 | Grad Max: 0.098796
  -> Layer: shared_layers.0.bias | Grad Mean: 0.242382 | Grad Max: 1.345094
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.005782
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008534 | Grad Max: 0.008534
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001475 | Grad Max: 0.552085
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026587 | Grad Max: 3.097513
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004394
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.013091 | Grad Max: 0.078090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000259
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002478 | Grad Max: 0.006691
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000171
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000603 | Grad Max: 0.002282
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000358 | Grad Max: 0.001650
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009028 | Grad Max: 0.009028
[GRADIENT NORM TOTAL] 6.0961

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.943
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.541706   0.45829394] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 692/1356 | B: 625/1231 | C: 636/1412
[LOSS Ex1] A: 0.63114 | B: 0.60529 | C: 0.59573
[LOGITS Ex2 A] Mean Abs: 2.267 | Max: 6.210
[LOSS Ex2] A: 0.10063 | B: 0.28369 | C: 0.20835
** [JOINT LOSS] ** : 0.808276
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004462 | Grad Max: 0.173319
  -> Layer: shared_layers.0.bias | Grad Mean: 0.500568 | Grad Max: 2.356085
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005284
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003399 | Grad Max: 0.003399
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003197 | Grad Max: 0.454253
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.059481 | Grad Max: 2.511084
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.012377
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035200 | Grad Max: 0.195682
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000501
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006628 | Grad Max: 0.014343
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000383
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001558 | Grad Max: 0.005838
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.002197
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.020642 | Grad Max: 0.020642
[GRADIENT NORM TOTAL] 10.6611

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.186
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8291448  0.17085524] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.089
[MASKS] A(Pass/Fail): 749/1299 | B: 675/1373 | C: 644/1404
[LOSS Ex1] A: 0.62445 | B: 0.60547 | C: 0.60157
[LOGITS Ex2 A] Mean Abs: 2.335 | Max: 7.758
[LOSS Ex2] A: 0.08712 | B: 0.29499 | C: 0.20315
** [JOINT LOSS] ** : 0.805581
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004433 | Grad Max: 0.133531
  -> Layer: shared_layers.0.bias | Grad Mean: 0.160500 | Grad Max: 0.802365
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005926
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000631 | Grad Max: 0.000631
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001338 | Grad Max: 0.206997
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023052 | Grad Max: 1.037719
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.005103
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007972 | Grad Max: 0.065151
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000147
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001188 | Grad Max: 0.004258
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000104
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000281 | Grad Max: 0.001374
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000305 | Grad Max: 0.001250
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004360 | Grad Max: 0.004360
[GRADIENT NORM TOTAL] 3.9758

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.318
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007845  0.49921548] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 726/1322 | B: 663/1385 | C: 657/1391
[LOSS Ex1] A: 0.63177 | B: 0.60572 | C: 0.60088
[LOGITS Ex2 A] Mean Abs: 2.402 | Max: 7.106
[LOSS Ex2] A: 0.08300 | B: 0.31088 | C: 0.20404
** [JOINT LOSS] ** : 0.812101
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005714 | Grad Max: 0.303205
  -> Layer: shared_layers.0.bias | Grad Mean: 0.768485 | Grad Max: 4.102230
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.004893
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002099 | Grad Max: 0.002099
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005069 | Grad Max: 0.783775
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.094804 | Grad Max: 4.331377
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000474 | Grad Max: 0.016212
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049215 | Grad Max: 0.265388
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000647
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009283 | Grad Max: 0.018502
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000484
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002207 | Grad Max: 0.007297
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000873 | Grad Max: 0.002582
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028054 | Grad Max: 0.028054
[GRADIENT NORM TOTAL] 17.8014

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 0.928
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7527278  0.24727216] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.089
[MASKS] A(Pass/Fail): 713/1335 | B: 673/1375 | C: 685/1363
[LOSS Ex1] A: 0.62670 | B: 0.60120 | C: 0.59940
[LOGITS Ex2 A] Mean Abs: 2.372 | Max: 6.716
[LOSS Ex2] A: 0.10771 | B: 0.29012 | C: 0.22309
** [JOINT LOSS] ** : 0.816071
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.318381
  -> Layer: shared_layers.0.bias | Grad Mean: 0.752907 | Grad Max: 4.323667
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005556
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005399 | Grad Max: 0.005399
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004883 | Grad Max: 0.758179
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.090849 | Grad Max: 4.211820
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000436 | Grad Max: 0.015832
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045854 | Grad Max: 0.248037
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000571
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008434 | Grad Max: 0.017280
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000457
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001968 | Grad Max: 0.006956
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000791 | Grad Max: 0.002265
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025672 | Grad Max: 0.025672
[GRADIENT NORM TOTAL] 17.7795

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.080
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64723617 0.35276377] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 590/1026 | B: 625/1231 | C: 680/1368
[LOSS Ex1] A: 0.62479 | B: 0.60519 | C: 0.60800
[LOGITS Ex2 A] Mean Abs: 2.380 | Max: 8.272
[LOSS Ex2] A: 0.09819 | B: 0.27895 | C: 0.22537
** [JOINT LOSS] ** : 0.813497
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004712 | Grad Max: 0.135212
  -> Layer: shared_layers.0.bias | Grad Mean: 0.206931 | Grad Max: 1.157732
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.006074
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001905 | Grad Max: 0.001905
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001592 | Grad Max: 0.254734
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.027336 | Grad Max: 1.357807
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.004764
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.009761 | Grad Max: 0.071782
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000190
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001647 | Grad Max: 0.005007
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000130
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000386 | Grad Max: 0.001617
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000267 | Grad Max: 0.000986
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004100 | Grad Max: 0.004100
[GRADIENT NORM TOTAL] 5.1113

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.319
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083452  0.49165478] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 728/1320 | B: 675/1373 | C: 641/1407
[LOSS Ex1] A: 0.62556 | B: 0.60537 | C: 0.60209
[LOGITS Ex2 A] Mean Abs: 2.322 | Max: 10.590
[LOSS Ex2] A: 0.08958 | B: 0.31013 | C: 0.20957
** [JOINT LOSS] ** : 0.814102
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012020 | Grad Max: 0.307126
  -> Layer: shared_layers.0.bias | Grad Mean: 0.838868 | Grad Max: 4.057852
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005446
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005408 | Grad Max: 0.005408
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005314 | Grad Max: 0.919225
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.097611 | Grad Max: 5.102306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000530 | Grad Max: 0.015335
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053822 | Grad Max: 0.260565
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000739
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010674 | Grad Max: 0.021618
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000581
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002579 | Grad Max: 0.008680
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001188 | Grad Max: 0.003475
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035530 | Grad Max: 0.035530
[GRADIENT NORM TOTAL] 17.9125

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.236
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044997  0.49550033] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 721/1327 | B: 663/1385 | C: 394/982
[LOSS Ex1] A: 0.62222 | B: 0.60562 | C: 0.61132
[LOGITS Ex2 A] Mean Abs: 2.302 | Max: 7.393
[LOSS Ex2] A: 0.09680 | B: 0.31810 | C: 0.23263
** [JOINT LOSS] ** : 0.828895
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009771 | Grad Max: 0.330906
  -> Layer: shared_layers.0.bias | Grad Mean: 0.908256 | Grad Max: 4.276681
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.005902
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002440 | Grad Max: 0.002440
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005765 | Grad Max: 1.013015
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107783 | Grad Max: 5.606791
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000557 | Grad Max: 0.017812
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057844 | Grad Max: 0.307251
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000733
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011147 | Grad Max: 0.022126
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000604
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002695 | Grad Max: 0.008630
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001188 | Grad Max: 0.003070
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037001 | Grad Max: 0.037001
[GRADIENT NORM TOTAL] 20.3421

[EPOCH SUMMARY] Train Loss: 0.8128

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7921 | Alpha: 0.5500
No improve count: 2/15

############################## EPOCH 176/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.274
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51155525 0.48844478] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.089
[MASKS] A(Pass/Fail): 725/1323 | B: 673/1375 | C: 646/1402
[LOSS Ex1] A: 0.62115 | B: 0.60110 | C: 0.60301
[LOGITS Ex2 A] Mean Abs: 2.295 | Max: 7.810
[LOSS Ex2] A: 0.10192 | B: 0.27232 | C: 0.19584
** [JOINT LOSS] ** : 0.798452
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004767 | Grad Max: 0.169517
  -> Layer: shared_layers.0.bias | Grad Mean: 0.406693 | Grad Max: 2.326998
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005832
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003771 | Grad Max: 0.003771
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.734370
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041024 | Grad Max: 4.045258
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000183 | Grad Max: 0.006965
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018633 | Grad Max: 0.104784
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000312
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003602 | Grad Max: 0.008417
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000221
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000871 | Grad Max: 0.003153
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000417 | Grad Max: 0.002012
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012088 | Grad Max: 0.012088
[GRADIENT NORM TOTAL] 9.4626

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.055
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50038296 0.49961704] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.564 | Std: 0.088
[MASKS] A(Pass/Fail): 690/1358 | B: 625/1231 | C: 645/1403
[LOSS Ex1] A: 0.63021 | B: 0.60508 | C: 0.59613
[LOGITS Ex2 A] Mean Abs: 2.328 | Max: 5.551
[LOSS Ex2] A: 0.09202 | B: 0.28117 | C: 0.22214
** [JOINT LOSS] ** : 0.808918
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006454 | Grad Max: 0.189014
  -> Layer: shared_layers.0.bias | Grad Mean: 0.556587 | Grad Max: 2.473382
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.005265
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004256 | Grad Max: 0.004256
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003601 | Grad Max: 0.550460
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066531 | Grad Max: 3.027926
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000383 | Grad Max: 0.011153
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039469 | Grad Max: 0.204650
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000578
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007654 | Grad Max: 0.016003
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000400
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001843 | Grad Max: 0.005908
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000748 | Grad Max: 0.002410
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024358 | Grad Max: 0.024358
[GRADIENT NORM TOTAL] 11.8718

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.946
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5416846 0.4583154] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 692/1356 | B: 675/1373 | C: 640/1408
[LOSS Ex1] A: 0.63102 | B: 0.60527 | C: 0.59732
[LOGITS Ex2 A] Mean Abs: 2.314 | Max: 6.729
[LOSS Ex2] A: 0.10474 | B: 0.30602 | C: 0.18603
** [JOINT LOSS] ** : 0.810131
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008293 | Grad Max: 0.235354
  -> Layer: shared_layers.0.bias | Grad Mean: 0.698026 | Grad Max: 3.066502
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005526
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008902 | Grad Max: 0.008902
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004475 | Grad Max: 0.654067
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083127 | Grad Max: 3.637976
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000478 | Grad Max: 0.014288
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048956 | Grad Max: 0.244774
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000661
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009489 | Grad Max: 0.019240
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000495
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002281 | Grad Max: 0.007255
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000982 | Grad Max: 0.002913
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030466 | Grad Max: 0.030466
[GRADIENT NORM TOTAL] 14.6849

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.189
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8297274  0.17027256] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 749/1299 | B: 664/1384 | C: 673/1375
[LOSS Ex1] A: 0.62432 | B: 0.60553 | C: 0.59827
[LOGITS Ex2 A] Mean Abs: 2.323 | Max: 8.646
[LOSS Ex2] A: 0.09756 | B: 0.28975 | C: 0.21118
** [JOINT LOSS] ** : 0.808868
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004091 | Grad Max: 0.135214
  -> Layer: shared_layers.0.bias | Grad Mean: 0.143170 | Grad Max: 0.604071
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.005584
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004638 | Grad Max: 0.004638
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001168 | Grad Max: 0.432684
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.020419 | Grad Max: 2.347293
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003359
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.007826 | Grad Max: 0.042020
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000230
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001601 | Grad Max: 0.004884
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000368 | Grad Max: 0.001374
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000333 | Grad Max: 0.001173
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.004160 | Grad Max: 0.004160
[GRADIENT NORM TOTAL] 4.3147

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.321
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50077283 0.49922717] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 726/1322 | B: 673/1375 | C: 655/1393
[LOSS Ex1] A: 0.63166 | B: 0.60100 | C: 0.59768
[LOGITS Ex2 A] Mean Abs: 2.281 | Max: 6.477
[LOSS Ex2] A: 0.09097 | B: 0.28981 | C: 0.20820
** [JOINT LOSS] ** : 0.806440
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009641 | Grad Max: 0.295676
  -> Layer: shared_layers.0.bias | Grad Mean: 0.839323 | Grad Max: 3.761452
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.005133
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003532 | Grad Max: 0.003532
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005253 | Grad Max: 0.668250
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.097118 | Grad Max: 3.666086
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.018441
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.055272 | Grad Max: 0.298657
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000727
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010780 | Grad Max: 0.021595
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000597
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002610 | Grad Max: 0.009134
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001149 | Grad Max: 0.003227
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035172 | Grad Max: 0.035172
[GRADIENT NORM TOTAL] 17.4550

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.931
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75322014 0.2467799 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.566 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 625/1231 | C: 678/1370
[LOSS Ex1] A: 0.62658 | B: 0.60497 | C: 0.60087
[LOGITS Ex2 A] Mean Abs: 2.258 | Max: 8.097
[LOSS Ex2] A: 0.10902 | B: 0.31417 | C: 0.22213
** [JOINT LOSS] ** : 0.825911
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010580 | Grad Max: 0.312053
  -> Layer: shared_layers.0.bias | Grad Mean: 0.922236 | Grad Max: 4.068727
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005222
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002779 | Grad Max: 0.002779
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005780 | Grad Max: 0.726538
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107350 | Grad Max: 4.019336
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000620 | Grad Max: 0.017326
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063954 | Grad Max: 0.309973
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000832
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012350 | Grad Max: 0.024379
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000624
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002960 | Grad Max: 0.009674
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001289 | Grad Max: 0.003184
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039872 | Grad Max: 0.039872
[GRADIENT NORM TOTAL] 19.1998

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.084
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6474591  0.35254088] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 590/1026 | B: 675/1373 | C: 674/1374
[LOSS Ex1] A: 0.62467 | B: 0.60517 | C: 0.59681
[LOGITS Ex2 A] Mean Abs: 2.333 | Max: 11.805
[LOSS Ex2] A: 0.10076 | B: 0.30491 | C: 0.20244
** [JOINT LOSS] ** : 0.811586
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006327 | Grad Max: 0.168280
  -> Layer: shared_layers.0.bias | Grad Mean: 0.344519 | Grad Max: 1.465969
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.005915
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002386 | Grad Max: 0.002386
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002337 | Grad Max: 0.344608
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.041627 | Grad Max: 1.882188
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.007869
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.024560 | Grad Max: 0.132458
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000431
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004974 | Grad Max: 0.010628
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000275
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001195 | Grad Max: 0.004066
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.001981
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.016539 | Grad Max: 0.016539
[GRADIENT NORM TOTAL] 7.2624

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.323
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083105 0.4916895] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 728/1320 | B: 664/1384 | C: 649/1399
[LOSS Ex1] A: 0.62544 | B: 0.60542 | C: 0.60062
[LOGITS Ex2 A] Mean Abs: 2.372 | Max: 9.375
[LOSS Ex2] A: 0.09218 | B: 0.32349 | C: 0.23122
** [JOINT LOSS] ** : 0.826122
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007674 | Grad Max: 0.458716
  -> Layer: shared_layers.0.bias | Grad Mean: 1.068816 | Grad Max: 6.224372
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005626
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007668 | Grad Max: 0.007668
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006581 | Grad Max: 1.381277
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.123063 | Grad Max: 7.675564
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000591 | Grad Max: 0.018532
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.062200 | Grad Max: 0.338443
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000809
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011810 | Grad Max: 0.023549
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000589
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002833 | Grad Max: 0.009431
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001139 | Grad Max: 0.003137
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036723 | Grad Max: 0.036723
[GRADIENT NORM TOTAL] 25.0561

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.240
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50446427 0.4955357 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 721/1327 | B: 673/1375 | C: 668/1380
[LOSS Ex1] A: 0.62209 | B: 0.60090 | C: 0.60719
[LOGITS Ex2 A] Mean Abs: 2.407 | Max: 7.554
[LOSS Ex2] A: 0.10652 | B: 0.33220 | C: 0.25132
** [JOINT LOSS] ** : 0.840073
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015401 | Grad Max: 0.679527
  -> Layer: shared_layers.0.bias | Grad Mean: 1.738411 | Grad Max: 9.155875
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005962
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006361 | Grad Max: 0.006361
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.010840 | Grad Max: 1.969803
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.202533 | Grad Max: 10.932016
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001033 | Grad Max: 0.030437
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.108418 | Grad Max: 0.533545
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000138 | Grad Max: 0.001288
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.020717 | Grad Max: 0.040233
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000992
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004965 | Grad Max: 0.016357
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002068 | Grad Max: 0.004901
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064879 | Grad Max: 0.064879
[GRADIENT NORM TOTAL] 39.0081

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.278
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5116212  0.48837882] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 725/1323 | B: 625/1231 | C: 649/1399
[LOSS Ex1] A: 0.62103 | B: 0.60487 | C: 0.60509
[LOGITS Ex2 A] Mean Abs: 2.369 | Max: 7.328
[LOSS Ex2] A: 0.11993 | B: 0.33389 | C: 0.23228
** [JOINT LOSS] ** : 0.839034
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015557 | Grad Max: 0.606003
  -> Layer: shared_layers.0.bias | Grad Mean: 1.567606 | Grad Max: 8.038385
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005943
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003477 | Grad Max: 0.003477
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.009911 | Grad Max: 1.763369
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.184381 | Grad Max: 9.800306
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000969 | Grad Max: 0.029891
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.101400 | Grad Max: 0.510848
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000131 | Grad Max: 0.001219
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.019654 | Grad Max: 0.038537
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000888
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004764 | Grad Max: 0.014826
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002063 | Grad Max: 0.004841
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.063717 | Grad Max: 0.063717
[GRADIENT NORM TOTAL] 34.8360

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.057
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003391 0.4996609] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.088
[MASKS] A(Pass/Fail): 690/1358 | B: 675/1373 | C: 682/1366
[LOSS Ex1] A: 0.63009 | B: 0.60508 | C: 0.59551
[LOGITS Ex2 A] Mean Abs: 2.302 | Max: 6.050
[LOSS Ex2] A: 0.09132 | B: 0.30923 | C: 0.20967
** [JOINT LOSS] ** : 0.813633
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008985 | Grad Max: 0.320433
  -> Layer: shared_layers.0.bias | Grad Mean: 0.742529 | Grad Max: 3.755771
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.005790
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001985 | Grad Max: 0.001985
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004664 | Grad Max: 0.733430
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086578 | Grad Max: 4.065271
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000466 | Grad Max: 0.016186
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048268 | Grad Max: 0.259386
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000607
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009413 | Grad Max: 0.018363
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000399
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002313 | Grad Max: 0.006905
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000944 | Grad Max: 0.002976
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030664 | Grad Max: 0.030664
[GRADIENT NORM TOTAL] 16.2391

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.948
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54168504 0.45831496] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 693/1355 | B: 664/1384 | C: 657/1391
[LOSS Ex1] A: 0.63092 | B: 0.60534 | C: 0.59929
[LOGITS Ex2 A] Mean Abs: 2.234 | Max: 6.458
[LOSS Ex2] A: 0.09628 | B: 0.30228 | C: 0.20343
** [JOINT LOSS] ** : 0.812509
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007075 | Grad Max: 0.237467
  -> Layer: shared_layers.0.bias | Grad Mean: 0.620302 | Grad Max: 3.225569
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005532
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009951 | Grad Max: 0.009951
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003888 | Grad Max: 0.925578
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.072216 | Grad Max: 5.112112
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.013230
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039991 | Grad Max: 0.218175
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000545
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007818 | Grad Max: 0.015970
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000454
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001906 | Grad Max: 0.006503
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000899 | Grad Max: 0.002905
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026795 | Grad Max: 0.026795
[GRADIENT NORM TOTAL] 14.0493

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.191
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8302773 0.1697227] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 749/1299 | B: 673/1375 | C: 616/1432
[LOSS Ex1] A: 0.62421 | B: 0.60082 | C: 0.60812
[LOGITS Ex2 A] Mean Abs: 2.236 | Max: 8.160
[LOSS Ex2] A: 0.09090 | B: 0.31453 | C: 0.23167
** [JOINT LOSS] ** : 0.823417
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008648 | Grad Max: 0.374222
  -> Layer: shared_layers.0.bias | Grad Mean: 1.014057 | Grad Max: 4.903749
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005299
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001027 | Grad Max: 0.001027
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006684 | Grad Max: 1.138995
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.125916 | Grad Max: 6.286173
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000681 | Grad Max: 0.022003
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.071449 | Grad Max: 0.355191
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.000856
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013628 | Grad Max: 0.026281
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000649
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003303 | Grad Max: 0.010533
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001396 | Grad Max: 0.003381
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044123 | Grad Max: 0.044123
[GRADIENT NORM TOTAL] 22.9404

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.325
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50077987 0.49922007] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 726/1322 | B: 625/1231 | C: 469/907
[LOSS Ex1] A: 0.63155 | B: 0.60480 | C: 0.59451
[LOGITS Ex2 A] Mean Abs: 2.260 | Max: 6.926
[LOSS Ex2] A: 0.08420 | B: 0.31117 | C: 0.20496
** [JOINT LOSS] ** : 0.810398
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007051 | Grad Max: 0.302377
  -> Layer: shared_layers.0.bias | Grad Mean: 0.825366 | Grad Max: 4.064525
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.006189
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001021 | Grad Max: 0.001021
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005577 | Grad Max: 0.947340
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.103348 | Grad Max: 5.248195
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000549 | Grad Max: 0.017476
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057967 | Grad Max: 0.289239
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000730
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011255 | Grad Max: 0.023302
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000598
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002706 | Grad Max: 0.009348
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001183 | Grad Max: 0.003291
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036619 | Grad Max: 0.036619
[GRADIENT NORM TOTAL] 18.9271

[EPOCH SUMMARY] Train Loss: 0.8168

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7917 | Alpha: 0.5500
No improve count: 3/15

############################## EPOCH 177/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.933
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7535705  0.24642953] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 675/1373 | C: 646/1402
[LOSS Ex1] A: 0.62646 | B: 0.60501 | C: 0.60406
[LOGITS Ex2 A] Mean Abs: 2.271 | Max: 7.725
[LOSS Ex2] A: 0.10119 | B: 0.30364 | C: 0.20967
** [JOINT LOSS] ** : 0.816677
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002601 | Grad Max: 0.064463
  -> Layer: shared_layers.0.bias | Grad Mean: 0.176980 | Grad Max: 0.805885
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005192
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002578 | Grad Max: 0.002578
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001324 | Grad Max: 0.198173
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.023688 | Grad Max: 1.089600
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000114 | Grad Max: 0.004608
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011457 | Grad Max: 0.074510
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000237
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002063 | Grad Max: 0.005910
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000487 | Grad Max: 0.002017
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000306 | Grad Max: 0.001384
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.006703 | Grad Max: 0.006703
[GRADIENT NORM TOTAL] 4.2812

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.086
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64751923 0.3524808 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 590/1026 | B: 664/1384 | C: 648/1400
[LOSS Ex1] A: 0.62455 | B: 0.60526 | C: 0.60337
[LOGITS Ex2 A] Mean Abs: 2.390 | Max: 12.313
[LOSS Ex2] A: 0.10441 | B: 0.33511 | C: 0.20290
** [JOINT LOSS] ** : 0.825201
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012743 | Grad Max: 0.503062
  -> Layer: shared_layers.0.bias | Grad Mean: 1.257195 | Grad Max: 6.637047
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005536
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007910 | Grad Max: 0.007910
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008072 | Grad Max: 1.273854
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.150060 | Grad Max: 7.126526
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000788 | Grad Max: 0.024811
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.082576 | Grad Max: 0.413432
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000107 | Grad Max: 0.001029
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.016124 | Grad Max: 0.031277
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000802
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003962 | Grad Max: 0.012663
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001647 | Grad Max: 0.004041
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052696 | Grad Max: 0.052696
[GRADIENT NORM TOTAL] 27.9295

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.326
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50835264 0.49164736] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 728/1320 | B: 673/1375 | C: 672/1376
[LOSS Ex1] A: 0.62532 | B: 0.60075 | C: 0.59849
[LOGITS Ex2 A] Mean Abs: 2.358 | Max: 9.378
[LOSS Ex2] A: 0.10273 | B: 0.35893 | C: 0.26500
** [JOINT LOSS] ** : 0.850408
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.017955 | Grad Max: 0.706707
  -> Layer: shared_layers.0.bias | Grad Mean: 1.797118 | Grad Max: 9.609874
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005373
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000206 | Grad Max: 0.000206
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.011493 | Grad Max: 1.675225
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.213592 | Grad Max: 9.390087
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001136 | Grad Max: 0.034902
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.118895 | Grad Max: 0.602330
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000154 | Grad Max: 0.001463
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.023062 | Grad Max: 0.045407
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.001151
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.005587 | Grad Max: 0.018666
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002347 | Grad Max: 0.005865
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.072860 | Grad Max: 0.072860
[GRADIENT NORM TOTAL] 39.7501

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.242
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50437355 0.49562642] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 721/1327 | B: 625/1231 | C: 650/1398
[LOSS Ex1] A: 0.62198 | B: 0.60473 | C: 0.60203
[LOGITS Ex2 A] Mean Abs: 2.364 | Max: 6.491
[LOSS Ex2] A: 0.10376 | B: 0.34067 | C: 0.25229
** [JOINT LOSS] ** : 0.841820
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014724 | Grad Max: 0.536502
  -> Layer: shared_layers.0.bias | Grad Mean: 1.455223 | Grad Max: 7.018998
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006578
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.013934 | Grad Max: 0.013934
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.009392 | Grad Max: 1.385051
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.174771 | Grad Max: 7.653933
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000974 | Grad Max: 0.030309
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.102527 | Grad Max: 0.530467
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000132 | Grad Max: 0.001242
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.019920 | Grad Max: 0.038326
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.001073
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004933 | Grad Max: 0.016441
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002082 | Grad Max: 0.005174
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.066274 | Grad Max: 0.066274
[GRADIENT NORM TOTAL] 31.6905

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.280
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51172817 0.4882719 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 725/1323 | B: 675/1373 | C: 630/1418
[LOSS Ex1] A: 0.62092 | B: 0.60495 | C: 0.60300
[LOGITS Ex2 A] Mean Abs: 2.285 | Max: 6.376
[LOSS Ex2] A: 0.10337 | B: 0.30643 | C: 0.20875
** [JOINT LOSS] ** : 0.815806
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007675 | Grad Max: 0.216533
  -> Layer: shared_layers.0.bias | Grad Mean: 0.655335 | Grad Max: 2.921336
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.006151
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000210 | Grad Max: 0.000210
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004349 | Grad Max: 0.566335
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080491 | Grad Max: 3.160355
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000468 | Grad Max: 0.015867
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048683 | Grad Max: 0.247603
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000629
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009551 | Grad Max: 0.018669
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000484
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002346 | Grad Max: 0.007862
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000980 | Grad Max: 0.003036
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031040 | Grad Max: 0.031040
[GRADIENT NORM TOTAL] 13.8384

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.060
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50036466 0.49963534] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.088
[MASKS] A(Pass/Fail): 690/1358 | B: 664/1384 | C: 676/1372
[LOSS Ex1] A: 0.62998 | B: 0.60520 | C: 0.59504
[LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.286
[LOSS Ex2] A: 0.09770 | B: 0.31739 | C: 0.21319
** [JOINT LOSS] ** : 0.819501
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008084 | Grad Max: 0.291463
  -> Layer: shared_layers.0.bias | Grad Mean: 0.802231 | Grad Max: 3.723869
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005389
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003741 | Grad Max: 0.003741
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004892 | Grad Max: 0.906629
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.090887 | Grad Max: 5.089719
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000497 | Grad Max: 0.016187
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.052098 | Grad Max: 0.266044
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000661
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010111 | Grad Max: 0.020397
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000510
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002470 | Grad Max: 0.008187
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001052 | Grad Max: 0.003059
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032957 | Grad Max: 0.032957
[GRADIENT NORM TOTAL] 17.7342

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.949
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.541623   0.45837697] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 693/1355 | B: 673/1375 | C: 673/1375
[LOSS Ex1] A: 0.63081 | B: 0.60069 | C: 0.59743
[LOGITS Ex2 A] Mean Abs: 2.106 | Max: 6.179
[LOSS Ex2] A: 0.12230 | B: 0.33913 | C: 0.21808
** [JOINT LOSS] ** : 0.836146
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.016352 | Grad Max: 0.439893
  -> Layer: shared_layers.0.bias | Grad Mean: 1.348686 | Grad Max: 5.885354
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005254
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007072 | Grad Max: 0.007072
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008465 | Grad Max: 1.337051
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.157538 | Grad Max: 7.400558
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000892 | Grad Max: 0.026617
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.092705 | Grad Max: 0.470533
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000122 | Grad Max: 0.001193
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.018107 | Grad Max: 0.035043
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000937
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004412 | Grad Max: 0.014248
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001891 | Grad Max: 0.004322
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058377 | Grad Max: 0.058377
[GRADIENT NORM TOTAL] 28.8972

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.194
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8306088  0.16939126] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 747/1301 | B: 625/1231 | C: 674/1374
[LOSS Ex1] A: 0.62411 | B: 0.60466 | C: 0.59541
[LOGITS Ex2 A] Mean Abs: 2.160 | Max: 7.427
[LOSS Ex2] A: 0.09818 | B: 0.34529 | C: 0.22850
** [JOINT LOSS] ** : 0.832046
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014555 | Grad Max: 0.467094
  -> Layer: shared_layers.0.bias | Grad Mean: 1.286538 | Grad Max: 6.225985
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.006022
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003153 | Grad Max: 0.003153
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007957 | Grad Max: 1.152430
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.148720 | Grad Max: 6.380592
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000872 | Grad Max: 0.027025
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.091388 | Grad Max: 0.475451
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000119 | Grad Max: 0.001127
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017816 | Grad Max: 0.034294
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000917
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004398 | Grad Max: 0.014211
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001872 | Grad Max: 0.004238
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058969 | Grad Max: 0.058969
[GRADIENT NORM TOTAL] 27.1080

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.327
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50079536 0.49920467] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 726/1322 | B: 675/1373 | C: 682/1366
[LOSS Ex1] A: 0.63145 | B: 0.60488 | C: 0.59898
[LOGITS Ex2 A] Mean Abs: 2.221 | Max: 6.506
[LOSS Ex2] A: 0.09114 | B: 0.32030 | C: 0.19951
** [JOINT LOSS] ** : 0.815419
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009127 | Grad Max: 0.268276
  -> Layer: shared_layers.0.bias | Grad Mean: 0.717261 | Grad Max: 3.552707
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005213
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001095 | Grad Max: 0.001095
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004522 | Grad Max: 0.792040
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.082774 | Grad Max: 4.456845
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000471 | Grad Max: 0.013817
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048958 | Grad Max: 0.243816
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000679
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009757 | Grad Max: 0.020002
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000498
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002403 | Grad Max: 0.007997
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001063 | Grad Max: 0.003052
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032723 | Grad Max: 0.032723
[GRADIENT NORM TOTAL] 15.7850

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.934
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7537448 0.2462552] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 664/1384 | C: 686/1362
[LOSS Ex1] A: 0.62637 | B: 0.60514 | C: 0.60089
[LOGITS Ex2 A] Mean Abs: 2.244 | Max: 7.691
[LOSS Ex2] A: 0.10820 | B: 0.29639 | C: 0.20988
** [JOINT LOSS] ** : 0.815625
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005146 | Grad Max: 0.176535
  -> Layer: shared_layers.0.bias | Grad Mean: 0.594779 | Grad Max: 2.386675
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.005605
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003957 | Grad Max: 0.003957
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004035 | Grad Max: 0.687226
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.075033 | Grad Max: 3.828740
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.012511
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.041745 | Grad Max: 0.212567
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000508
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008134 | Grad Max: 0.015973
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000427
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002015 | Grad Max: 0.006621
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000797 | Grad Max: 0.002772
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026304 | Grad Max: 0.026304
[GRADIENT NORM TOTAL] 13.7124

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.088
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.647562   0.35243797] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 590/1026 | B: 673/1375 | C: 681/1367
[LOSS Ex1] A: 0.62447 | B: 0.60063 | C: 0.59774
[LOGITS Ex2 A] Mean Abs: 2.338 | Max: 8.876
[LOSS Ex2] A: 0.09456 | B: 0.31315 | C: 0.21942
** [JOINT LOSS] ** : 0.816653
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009883 | Grad Max: 0.392674
  -> Layer: shared_layers.0.bias | Grad Mean: 1.069519 | Grad Max: 5.330593
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.006339
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011851 | Grad Max: 0.011851
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006987 | Grad Max: 1.035984
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.130419 | Grad Max: 5.736405
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000676 | Grad Max: 0.021846
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.071771 | Grad Max: 0.372744
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.000867
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013943 | Grad Max: 0.027202
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000718
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003435 | Grad Max: 0.011170
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001384 | Grad Max: 0.003643
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044827 | Grad Max: 0.044827
[GRADIENT NORM TOTAL] 24.1507

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.328
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50833124 0.49166873] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 728/1320 | B: 625/1231 | C: 664/1384
[LOSS Ex1] A: 0.62524 | B: 0.60460 | C: 0.60058
[LOGITS Ex2 A] Mean Abs: 2.304 | Max: 7.834
[LOSS Ex2] A: 0.09412 | B: 0.29890 | C: 0.21822
** [JOINT LOSS] ** : 0.813886
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009598 | Grad Max: 0.315105
  -> Layer: shared_layers.0.bias | Grad Mean: 0.906907 | Grad Max: 4.300270
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005409
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000351 | Grad Max: 0.000351
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005974 | Grad Max: 0.910318
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.110863 | Grad Max: 5.112164
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000586 | Grad Max: 0.018105
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061726 | Grad Max: 0.316824
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000760
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012165 | Grad Max: 0.023673
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000603
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003045 | Grad Max: 0.009350
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001244 | Grad Max: 0.003536
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040411 | Grad Max: 0.040411
[GRADIENT NORM TOTAL] 20.4554

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.244
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043426  0.49565738] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 721/1327 | B: 675/1373 | C: 655/1393
[LOSS Ex1] A: 0.62191 | B: 0.60483 | C: 0.59904
[LOGITS Ex2 A] Mean Abs: 2.265 | Max: 6.588
[LOSS Ex2] A: 0.10000 | B: 0.30126 | C: 0.20064
** [JOINT LOSS] ** : 0.809223
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004032 | Grad Max: 0.117286
  -> Layer: shared_layers.0.bias | Grad Mean: 0.216186 | Grad Max: 1.168710
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005912
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000665 | Grad Max: 0.000665
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.337230
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026813 | Grad Max: 1.869007
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.005409
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.015405 | Grad Max: 0.081479
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000270
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003002 | Grad Max: 0.006734
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000173
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000734 | Grad Max: 0.002558
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000389 | Grad Max: 0.001702
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.009713 | Grad Max: 0.009713
[GRADIENT NORM TOTAL] 5.0620

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.282
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117374 0.4882626] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 725/1323 | B: 664/1384 | C: 452/924
[LOSS Ex1] A: 0.62085 | B: 0.60508 | C: 0.60435
[LOGITS Ex2 A] Mean Abs: 2.127 | Max: 6.700
[LOSS Ex2] A: 0.10709 | B: 0.34018 | C: 0.23034
** [JOINT LOSS] ** : 0.835964
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010595 | Grad Max: 0.382774
  -> Layer: shared_layers.0.bias | Grad Mean: 1.031270 | Grad Max: 4.879426
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.005807
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010050 | Grad Max: 0.010050
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006544 | Grad Max: 0.785962
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.122486 | Grad Max: 4.322562
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000660 | Grad Max: 0.019084
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.070025 | Grad Max: 0.343889
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000862
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013667 | Grad Max: 0.026247
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000656
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003428 | Grad Max: 0.010455
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001500 | Grad Max: 0.003512
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047080 | Grad Max: 0.047080
[GRADIENT NORM TOTAL] 22.1416

[EPOCH SUMMARY] Train Loss: 0.8246

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8318 | Alpha: 0.5500
No improve count: 4/15

############################## EPOCH 178/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.061
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50030965 0.49969038] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.088
[MASKS] A(Pass/Fail): 690/1358 | B: 673/1375 | C: 686/1362
[LOSS Ex1] A: 0.62990 | B: 0.60057 | C: 0.59879
[LOGITS Ex2 A] Mean Abs: 2.082 | Max: 5.516
[LOSS Ex2] A: 0.11719 | B: 0.35197 | C: 0.23409
** [JOINT LOSS] ** : 0.844172
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015495 | Grad Max: 0.481567
  -> Layer: shared_layers.0.bias | Grad Mean: 1.396227 | Grad Max: 6.374991
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.006215
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011007 | Grad Max: 0.011007
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.009258 | Grad Max: 1.310631
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.172538 | Grad Max: 7.253920
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000930 | Grad Max: 0.026989
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.097892 | Grad Max: 0.485254
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000128 | Grad Max: 0.001212
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.019253 | Grad Max: 0.037220
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000902
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004817 | Grad Max: 0.014913
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002034 | Grad Max: 0.004765
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.064382 | Grad Max: 0.064382
[GRADIENT NORM TOTAL] 30.8616

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.951
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5415582 0.4584418] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 693/1355 | B: 625/1231 | C: 658/1390
[LOSS Ex1] A: 0.63074 | B: 0.60454 | C: 0.59795
[LOGITS Ex2 A] Mean Abs: 2.086 | Max: 6.303
[LOSS Ex2] A: 0.11533 | B: 0.34524 | C: 0.21621
** [JOINT LOSS] ** : 0.836670
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014633 | Grad Max: 0.413523
  -> Layer: shared_layers.0.bias | Grad Mean: 1.235357 | Grad Max: 5.719989
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.005455
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008703 | Grad Max: 0.008703
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008129 | Grad Max: 1.166060
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.151406 | Grad Max: 6.436661
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000830 | Grad Max: 0.024344
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.087128 | Grad Max: 0.436899
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000115 | Grad Max: 0.001100
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017232 | Grad Max: 0.033187
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000908
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004303 | Grad Max: 0.014079
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001828 | Grad Max: 0.004378
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057246 | Grad Max: 0.057246
[GRADIENT NORM TOTAL] 26.9739

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.196
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8309583  0.16904166] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 747/1301 | B: 676/1372 | C: 645/1403
[LOSS Ex1] A: 0.62403 | B: 0.60477 | C: 0.60211
[LOGITS Ex2 A] Mean Abs: 2.165 | Max: 6.873
[LOSS Ex2] A: 0.09053 | B: 0.32233 | C: 0.20309
** [JOINT LOSS] ** : 0.815617
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003730 | Grad Max: 0.201734
  -> Layer: shared_layers.0.bias | Grad Mean: 0.485077 | Grad Max: 2.755146
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005916
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004359 | Grad Max: 0.004359
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003082 | Grad Max: 0.492783
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057136 | Grad Max: 2.723402
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000287 | Grad Max: 0.009168
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.030954 | Grad Max: 0.157951
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000407
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005901 | Grad Max: 0.012561
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000296
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001456 | Grad Max: 0.004758
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000590 | Grad Max: 0.002096
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.019392 | Grad Max: 0.019392
[GRADIENT NORM TOTAL] 11.1007

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.330
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008096  0.49919042] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.088
[MASKS] A(Pass/Fail): 726/1322 | B: 664/1384 | C: 679/1369
[LOSS Ex1] A: 0.63137 | B: 0.60502 | C: 0.59840
[LOGITS Ex2 A] Mean Abs: 2.265 | Max: 6.798
[LOSS Ex2] A: 0.08806 | B: 0.31233 | C: 0.20962
** [JOINT LOSS] ** : 0.814933
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008138 | Grad Max: 0.351002
  -> Layer: shared_layers.0.bias | Grad Mean: 0.926143 | Grad Max: 4.856454
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.005448
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000216 | Grad Max: 0.000216
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005819 | Grad Max: 0.989143
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.108718 | Grad Max: 5.546182
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000575 | Grad Max: 0.017449
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061298 | Grad Max: 0.321369
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000737
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011967 | Grad Max: 0.023447
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000579
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002977 | Grad Max: 0.009540
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001165 | Grad Max: 0.003558
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038263 | Grad Max: 0.038263
[GRADIENT NORM TOTAL] 20.6755

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.936
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75404525 0.24595474] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 673/1375 | C: 665/1383
[LOSS Ex1] A: 0.62628 | B: 0.60051 | C: 0.59962
[LOGITS Ex2 A] Mean Abs: 2.251 | Max: 6.400
[LOSS Ex2] A: 0.11754 | B: 0.33876 | C: 0.24214
** [JOINT LOSS] ** : 0.841614
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012690 | Grad Max: 0.573189
  -> Layer: shared_layers.0.bias | Grad Mean: 1.509564 | Grad Max: 7.740436
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005088
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000396 | Grad Max: 0.000396
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.009585 | Grad Max: 1.493466
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.178645 | Grad Max: 8.291449
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000922 | Grad Max: 0.027532
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.098510 | Grad Max: 0.492495
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000125 | Grad Max: 0.001177
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.019276 | Grad Max: 0.036812
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000982
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004804 | Grad Max: 0.015995
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001953 | Grad Max: 0.004572
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062996 | Grad Max: 0.062996
[GRADIENT NORM TOTAL] 33.7659

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.090
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6476921  0.35230792] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 590/1026 | B: 625/1231 | C: 678/1370
[LOSS Ex1] A: 0.62438 | B: 0.60448 | C: 0.60086
[LOGITS Ex2 A] Mean Abs: 2.309 | Max: 7.232
[LOSS Ex2] A: 0.10220 | B: 0.32787 | C: 0.21470
** [JOINT LOSS] ** : 0.824830
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011408 | Grad Max: 0.483699
  -> Layer: shared_layers.0.bias | Grad Mean: 1.256945 | Grad Max: 6.387123
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005658
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003209 | Grad Max: 0.003209
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007972 | Grad Max: 1.196292
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.148294 | Grad Max: 6.610897
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000774 | Grad Max: 0.023026
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.082366 | Grad Max: 0.411196
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001023
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.016277 | Grad Max: 0.031388
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000798
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004153 | Grad Max: 0.012893
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001755 | Grad Max: 0.004518
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.056076 | Grad Max: 0.056076
[GRADIENT NORM TOTAL] 27.6184

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.330
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083298 0.4916702] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 728/1320 | B: 676/1372 | C: 675/1373
[LOSS Ex1] A: 0.62515 | B: 0.60471 | C: 0.59915
[LOGITS Ex2 A] Mean Abs: 2.244 | Max: 7.936
[LOSS Ex2] A: 0.09098 | B: 0.31119 | C: 0.21058
** [JOINT LOSS] ** : 0.813918
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005876 | Grad Max: 0.218066
  -> Layer: shared_layers.0.bias | Grad Mean: 0.610699 | Grad Max: 3.005422
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.005478
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003927 | Grad Max: 0.003927
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003934 | Grad Max: 0.588001
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.073182 | Grad Max: 3.261060
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.012607
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040831 | Grad Max: 0.208376
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000531
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008020 | Grad Max: 0.016290
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000402
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002005 | Grad Max: 0.006243
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000780 | Grad Max: 0.002772
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025785 | Grad Max: 0.025785
[GRADIENT NORM TOTAL] 13.6125

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.246
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043194 0.4956806] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 721/1327 | B: 664/1384 | C: 671/1377
[LOSS Ex1] A: 0.62181 | B: 0.60496 | C: 0.59831
[LOGITS Ex2 A] Mean Abs: 2.169 | Max: 6.745
[LOSS Ex2] A: 0.09990 | B: 0.30342 | C: 0.21889
** [JOINT LOSS] ** : 0.815762
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006835 | Grad Max: 0.227219
  -> Layer: shared_layers.0.bias | Grad Mean: 0.585403 | Grad Max: 3.097494
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.005570
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000833 | Grad Max: 0.000833
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003540 | Grad Max: 0.856583
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065935 | Grad Max: 4.733838
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000334 | Grad Max: 0.011307
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035126 | Grad Max: 0.186538
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000509
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006965 | Grad Max: 0.014325
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000363
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001777 | Grad Max: 0.006067
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000786 | Grad Max: 0.002656
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024349 | Grad Max: 0.024349
[GRADIENT NORM TOTAL] 13.5486

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.284
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51178336 0.48821664] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 725/1323 | B: 673/1375 | C: 667/1381
[LOSS Ex1] A: 0.62076 | B: 0.60046 | C: 0.60086
[LOGITS Ex2 A] Mean Abs: 2.106 | Max: 7.492
[LOSS Ex2] A: 0.10934 | B: 0.31171 | C: 0.22163
** [JOINT LOSS] ** : 0.821585
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010253 | Grad Max: 0.350202
  -> Layer: shared_layers.0.bias | Grad Mean: 0.991926 | Grad Max: 4.647109
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.005717
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004810 | Grad Max: 0.004810
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006262 | Grad Max: 1.051634
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.117396 | Grad Max: 5.802754
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000638 | Grad Max: 0.020951
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.067643 | Grad Max: 0.356323
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.000856
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013279 | Grad Max: 0.026948
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000685
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003385 | Grad Max: 0.010964
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001406 | Grad Max: 0.003719
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045002 | Grad Max: 0.045002
[GRADIENT NORM TOTAL] 21.7245

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.063
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50032    0.49967998] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.088
[MASKS] A(Pass/Fail): 690/1358 | B: 625/1231 | C: 684/1364
[LOSS Ex1] A: 0.62981 | B: 0.60443 | C: 0.60090
[LOGITS Ex2 A] Mean Abs: 2.091 | Max: 6.880
[LOSS Ex2] A: 0.10184 | B: 0.31230 | C: 0.21331
** [JOINT LOSS] ** : 0.820864
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009675 | Grad Max: 0.310430
  -> Layer: shared_layers.0.bias | Grad Mean: 0.917102 | Grad Max: 4.105142
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.005471
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002392 | Grad Max: 0.002392
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005876 | Grad Max: 1.015756
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.109613 | Grad Max: 5.629456
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000599 | Grad Max: 0.018799
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063172 | Grad Max: 0.327054
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000870
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012403 | Grad Max: 0.025008
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000663
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003112 | Grad Max: 0.010500
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001338 | Grad Max: 0.003715
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041015 | Grad Max: 0.041015
[GRADIENT NORM TOTAL] 20.2504

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.952
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5414626  0.45853743] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 693/1355 | B: 676/1372 | C: 676/1372
[LOSS Ex1] A: 0.63065 | B: 0.60466 | C: 0.59568
[LOGITS Ex2 A] Mean Abs: 2.128 | Max: 6.020
[LOSS Ex2] A: 0.09972 | B: 0.30935 | C: 0.20459
** [JOINT LOSS] ** : 0.814883
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003722 | Grad Max: 0.095721
  -> Layer: shared_layers.0.bias | Grad Mean: 0.289827 | Grad Max: 1.389866
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005749
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009965 | Grad Max: 0.009965
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.620611
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.035508 | Grad Max: 3.496173
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000180 | Grad Max: 0.005763
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018590 | Grad Max: 0.094042
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000316
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003809 | Grad Max: 0.008412
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000232
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000947 | Grad Max: 0.003722
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000434 | Grad Max: 0.002025
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.012131 | Grad Max: 0.012131
[GRADIENT NORM TOTAL] 7.3195

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.198
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8312995  0.16870058] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 747/1301 | B: 664/1384 | C: 672/1376
[LOSS Ex1] A: 0.62394 | B: 0.60491 | C: 0.59798
[LOGITS Ex2 A] Mean Abs: 2.212 | Max: 7.226
[LOSS Ex2] A: 0.10803 | B: 0.32319 | C: 0.20083
** [JOINT LOSS] ** : 0.819625
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011984 | Grad Max: 0.420977
  -> Layer: shared_layers.0.bias | Grad Mean: 1.042645 | Grad Max: 5.702648
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006064
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000153 | Grad Max: 0.000153
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006493 | Grad Max: 1.324240
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.120091 | Grad Max: 7.351596
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000615 | Grad Max: 0.018477
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065285 | Grad Max: 0.306428
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000826
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013103 | Grad Max: 0.025561
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000681
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003341 | Grad Max: 0.010421
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001348 | Grad Max: 0.003889
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043681 | Grad Max: 0.043681
[GRADIENT NORM TOTAL] 23.4887

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.332
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50084245 0.49915755] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 673/1375 | C: 667/1381
[LOSS Ex1] A: 0.63129 | B: 0.60040 | C: 0.60223
[LOGITS Ex2 A] Mean Abs: 2.254 | Max: 5.890
[LOSS Ex2] A: 0.10448 | B: 0.33430 | C: 0.25261
** [JOINT LOSS] ** : 0.841770
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015344 | Grad Max: 0.631404
  -> Layer: shared_layers.0.bias | Grad Mean: 1.580361 | Grad Max: 8.315900
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.005019
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004480 | Grad Max: 0.004480
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.009919 | Grad Max: 1.707251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.184946 | Grad Max: 9.513133
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000962 | Grad Max: 0.028312
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.102550 | Grad Max: 0.511825
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000133 | Grad Max: 0.001259
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.020324 | Grad Max: 0.038861
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.001005
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.005215 | Grad Max: 0.016692
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002139 | Grad Max: 0.005191
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.069059 | Grad Max: 0.069059
[GRADIENT NORM TOTAL] 35.1114

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.937
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75429577 0.24570425] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 625/1231 | C: 433/943
[LOSS Ex1] A: 0.62619 | B: 0.60437 | C: 0.60538
[LOGITS Ex2 A] Mean Abs: 2.236 | Max: 6.447
[LOSS Ex2] A: 0.11590 | B: 0.32659 | C: 0.22910
** [JOINT LOSS] ** : 0.835842
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014054 | Grad Max: 0.489677
  -> Layer: shared_layers.0.bias | Grad Mean: 1.264088 | Grad Max: 6.436862
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005185
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001868 | Grad Max: 0.001868
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007995 | Grad Max: 1.360477
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.148098 | Grad Max: 7.557583
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000783 | Grad Max: 0.023798
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.083062 | Grad Max: 0.413053
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000110 | Grad Max: 0.001073
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.016768 | Grad Max: 0.032966
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000786
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004351 | Grad Max: 0.013083
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001810 | Grad Max: 0.004578
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.058630 | Grad Max: 0.058630
[GRADIENT NORM TOTAL] 27.6909

[EPOCH SUMMARY] Train Loss: 0.8259

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7980 | Alpha: 0.5500
No improve count: 5/15

############################## EPOCH 179/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.092
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64778376 0.35221627] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 590/1026 | B: 676/1372 | C: 676/1372
[LOSS Ex1] A: 0.62429 | B: 0.60461 | C: 0.60094
[LOGITS Ex2 A] Mean Abs: 2.238 | Max: 11.010
[LOSS Ex2] A: 0.10081 | B: 0.30919 | C: 0.20766
** [JOINT LOSS] ** : 0.815836
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006441 | Grad Max: 0.223745
  -> Layer: shared_layers.0.bias | Grad Mean: 0.581253 | Grad Max: 2.850693
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005370
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006442 | Grad Max: 0.006442
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003537 | Grad Max: 0.548643
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065670 | Grad Max: 3.037905
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.011980
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036843 | Grad Max: 0.195814
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000539
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007364 | Grad Max: 0.015009
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000375
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001865 | Grad Max: 0.005987
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000765 | Grad Max: 0.002722
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024483 | Grad Max: 0.024483
[GRADIENT NORM TOTAL] 12.5158

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.333
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083524 0.4916476] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 728/1320 | B: 664/1384 | C: 682/1366
[LOSS Ex1] A: 0.62506 | B: 0.60486 | C: 0.60183
[LOGITS Ex2 A] Mean Abs: 2.142 | Max: 8.550
[LOSS Ex2] A: 0.09107 | B: 0.31905 | C: 0.22110
** [JOINT LOSS] ** : 0.820994
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006629 | Grad Max: 0.217803
  -> Layer: shared_layers.0.bias | Grad Mean: 0.624401 | Grad Max: 2.761595
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005495
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000852 | Grad Max: 0.000852
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004220 | Grad Max: 0.580231
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.078670 | Grad Max: 3.208545
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000429 | Grad Max: 0.013728
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.046005 | Grad Max: 0.220534
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000677
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009165 | Grad Max: 0.019732
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000531
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002350 | Grad Max: 0.008490
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001012 | Grad Max: 0.003261
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031983 | Grad Max: 0.031983
[GRADIENT NORM TOTAL] 13.9306

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.248
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5042411 0.4957589] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 720/1328 | B: 673/1375 | C: 666/1382
[LOSS Ex1] A: 0.62173 | B: 0.60036 | C: 0.60293
[LOGITS Ex2 A] Mean Abs: 2.127 | Max: 7.436
[LOSS Ex2] A: 0.10279 | B: 0.32214 | C: 0.20089
** [JOINT LOSS] ** : 0.816945
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009765 | Grad Max: 0.338026
  -> Layer: shared_layers.0.bias | Grad Mean: 0.957211 | Grad Max: 4.494227
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.005773
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007534 | Grad Max: 0.007534
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006349 | Grad Max: 1.071605
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.118654 | Grad Max: 5.917881
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000621 | Grad Max: 0.020885
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.066322 | Grad Max: 0.351665
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000855
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013081 | Grad Max: 0.025685
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000696
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003308 | Grad Max: 0.011082
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001366 | Grad Max: 0.003612
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043120 | Grad Max: 0.043120
[GRADIENT NORM TOTAL] 21.7681

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.286
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51185    0.48815003] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 725/1323 | B: 625/1231 | C: 728/1320
[LOSS Ex1] A: 0.62067 | B: 0.60432 | C: 0.59106
[LOGITS Ex2 A] Mean Abs: 2.095 | Max: 6.099
[LOSS Ex2] A: 0.11178 | B: 0.30989 | C: 0.20256
** [JOINT LOSS] ** : 0.813425
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007969 | Grad Max: 0.333315
  -> Layer: shared_layers.0.bias | Grad Mean: 0.867433 | Grad Max: 4.286891
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.005940
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001924 | Grad Max: 0.001924
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005557 | Grad Max: 0.918778
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.103954 | Grad Max: 5.065749
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000549 | Grad Max: 0.016703
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.058875 | Grad Max: 0.281448
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000765
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011546 | Grad Max: 0.023796
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000631
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002938 | Grad Max: 0.010108
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001221 | Grad Max: 0.003486
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038688 | Grad Max: 0.038688
[GRADIENT NORM TOTAL] 19.3267

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.064
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003217 0.4996783] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 690/1358 | B: 676/1372 | C: 700/1348
[LOSS Ex1] A: 0.62973 | B: 0.60457 | C: 0.59634
[LOGITS Ex2 A] Mean Abs: 2.137 | Max: 5.424
[LOSS Ex2] A: 0.09548 | B: 0.30624 | C: 0.21640
** [JOINT LOSS] ** : 0.816251
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002847 | Grad Max: 0.105010
  -> Layer: shared_layers.0.bias | Grad Mean: 0.290213 | Grad Max: 1.462826
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.005705
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009280 | Grad Max: 0.009280
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001788 | Grad Max: 0.233357
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.032828 | Grad Max: 1.302356
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.005994
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.018046 | Grad Max: 0.104424
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000316
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.003478 | Grad Max: 0.008410
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000232
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000843 | Grad Max: 0.003549
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.002078
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.011647 | Grad Max: 0.011647
[GRADIENT NORM TOTAL] 6.1569

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.954
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5413787  0.45862132] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 693/1355 | B: 664/1384 | C: 643/1405
[LOSS Ex1] A: 0.63057 | B: 0.60481 | C: 0.60104
[LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.035
[LOSS Ex2] A: 0.10271 | B: 0.32502 | C: 0.21349
** [JOINT LOSS] ** : 0.825877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008517 | Grad Max: 0.368495
  -> Layer: shared_layers.0.bias | Grad Mean: 1.002329 | Grad Max: 4.946062
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.005361
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009546 | Grad Max: 0.009546
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006288 | Grad Max: 1.070747
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.117348 | Grad Max: 5.916045
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000619 | Grad Max: 0.019206
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.066876 | Grad Max: 0.327694
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000863
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013285 | Grad Max: 0.026612
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000695
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003434 | Grad Max: 0.010514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001415 | Grad Max: 0.003864
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045398 | Grad Max: 0.045398
[GRADIENT NORM TOTAL] 22.3637

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.200
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8316228  0.16837719] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 747/1301 | B: 673/1375 | C: 647/1401
[LOSS Ex1] A: 0.62386 | B: 0.60030 | C: 0.59851
[LOGITS Ex2 A] Mean Abs: 2.237 | Max: 7.101
[LOSS Ex2] A: 0.11859 | B: 0.34749 | C: 0.23693
** [JOINT LOSS] ** : 0.841895
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.016028 | Grad Max: 0.598278
  -> Layer: shared_layers.0.bias | Grad Mean: 1.612443 | Grad Max: 8.055904
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.005621
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000474 | Grad Max: 0.000474
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.010207 | Grad Max: 1.544098
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.189630 | Grad Max: 8.640054
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.001010 | Grad Max: 0.029349
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.108224 | Grad Max: 0.525109
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000141 | Grad Max: 0.001435
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.021612 | Grad Max: 0.042000
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000059 | Grad Max: 0.001139
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.005566 | Grad Max: 0.017823
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002324 | Grad Max: 0.005489
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.073517 | Grad Max: 0.073517
[GRADIENT NORM TOTAL] 35.3616

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.334
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008749  0.49912515] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 625/1231 | C: 672/1376
[LOSS Ex1] A: 0.63120 | B: 0.60427 | C: 0.60061
[LOGITS Ex2 A] Mean Abs: 2.238 | Max: 6.311
[LOSS Ex2] A: 0.11064 | B: 0.34524 | C: 0.25644
** [JOINT LOSS] ** : 0.849467
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014291 | Grad Max: 0.495274
  -> Layer: shared_layers.0.bias | Grad Mean: 1.425849 | Grad Max: 6.624592
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.005452
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003590 | Grad Max: 0.003590
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.009085 | Grad Max: 1.330263
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.169754 | Grad Max: 7.335798
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000909 | Grad Max: 0.028273
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.097472 | Grad Max: 0.484537
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000127 | Grad Max: 0.001222
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.019403 | Grad Max: 0.037876
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.001045
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.005017 | Grad Max: 0.016646
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002079 | Grad Max: 0.004780
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.066100 | Grad Max: 0.066100
[GRADIENT NORM TOTAL] 30.7631

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.939
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75455636 0.24544364] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 676/1372 | C: 704/1344
[LOSS Ex1] A: 0.62610 | B: 0.60451 | C: 0.59889
[LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.398
[LOSS Ex2] A: 0.11245 | B: 0.31611 | C: 0.21232
** [JOINT LOSS] ** : 0.823463
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006240 | Grad Max: 0.236515
  -> Layer: shared_layers.0.bias | Grad Mean: 0.711698 | Grad Max: 3.210121
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.006094
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000309 | Grad Max: 0.000309
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004517 | Grad Max: 0.665101
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.084343 | Grad Max: 3.653263
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000474 | Grad Max: 0.014833
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.051033 | Grad Max: 0.255775
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000657
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010116 | Grad Max: 0.020944
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000561
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002626 | Grad Max: 0.008284
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001062 | Grad Max: 0.003246
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034629 | Grad Max: 0.034629
[GRADIENT NORM TOTAL] 15.1068

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.093
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.647944 0.352056] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 590/1026 | B: 664/1384 | C: 663/1385
[LOSS Ex1] A: 0.62420 | B: 0.60476 | C: 0.60062
[LOGITS Ex2 A] Mean Abs: 2.154 | Max: 9.626
[LOSS Ex2] A: 0.09728 | B: 0.30914 | C: 0.21092
** [JOINT LOSS] ** : 0.815636
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009556 | Grad Max: 0.272860
  -> Layer: shared_layers.0.bias | Grad Mean: 0.696616 | Grad Max: 3.675936
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005648
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009743 | Grad Max: 0.009743
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004138 | Grad Max: 0.933104
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.076476 | Grad Max: 5.163054
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.012262
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.041810 | Grad Max: 0.220111
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000613
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008555 | Grad Max: 0.017202
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000439
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002226 | Grad Max: 0.007068
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001016 | Grad Max: 0.003303
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030432 | Grad Max: 0.030432
[GRADIENT NORM TOTAL] 15.4797

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.335
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083989  0.49160108] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 673/1375 | C: 652/1396
[LOSS Ex1] A: 0.62498 | B: 0.60025 | C: 0.60158
[LOGITS Ex2 A] Mean Abs: 2.118 | Max: 7.771
[LOSS Ex2] A: 0.09648 | B: 0.32054 | C: 0.21210
** [JOINT LOSS] ** : 0.818645
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.013624 | Grad Max: 0.368030
  -> Layer: shared_layers.0.bias | Grad Mean: 1.120992 | Grad Max: 5.025382
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005339
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002526 | Grad Max: 0.002526
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006983 | Grad Max: 1.146319
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.129799 | Grad Max: 6.337554
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000725 | Grad Max: 0.020706
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.076519 | Grad Max: 0.381745
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001077
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.015424 | Grad Max: 0.030703
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000800
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003963 | Grad Max: 0.013128
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001693 | Grad Max: 0.004503
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052305 | Grad Max: 0.052305
[GRADIENT NORM TOTAL] 23.7567

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.250
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041692 0.4958307] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 720/1328 | B: 625/1231 | C: 639/1409
[LOSS Ex1] A: 0.62164 | B: 0.60421 | C: 0.60219
[LOGITS Ex2 A] Mean Abs: 2.106 | Max: 7.304
[LOSS Ex2] A: 0.10431 | B: 0.33565 | C: 0.21990
** [JOINT LOSS] ** : 0.829302
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011736 | Grad Max: 0.358405
  -> Layer: shared_layers.0.bias | Grad Mean: 1.038439 | Grad Max: 4.684211
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005492
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001884 | Grad Max: 0.001884
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006428 | Grad Max: 0.989724
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.119218 | Grad Max: 5.487039
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000658 | Grad Max: 0.018414
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.069797 | Grad Max: 0.329592
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.000926
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.014103 | Grad Max: 0.027276
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000752
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003658 | Grad Max: 0.011856
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001578 | Grad Max: 0.003902
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048762 | Grad Max: 0.048762
[GRADIENT NORM TOTAL] 21.9857

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.288
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5119436  0.48805642] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 685/1363
[LOSS Ex1] A: 0.62058 | B: 0.60446 | C: 0.59540
[LOGITS Ex2 A] Mean Abs: 2.127 | Max: 6.011
[LOSS Ex2] A: 0.10485 | B: 0.31134 | C: 0.20016
** [JOINT LOSS] ** : 0.812267
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004016 | Grad Max: 0.177553
  -> Layer: shared_layers.0.bias | Grad Mean: 0.419811 | Grad Max: 2.440453
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.006514
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003037 | Grad Max: 0.003037
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002462 | Grad Max: 0.455359
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.045314 | Grad Max: 2.506955
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.007705
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.023397 | Grad Max: 0.138219
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000329
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004612 | Grad Max: 0.010213
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000290
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001225 | Grad Max: 0.003785
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000547 | Grad Max: 0.002299
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.017153 | Grad Max: 0.017153
[GRADIENT NORM TOTAL] 9.3301

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.066
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003635  0.49963647] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 688/1360 | B: 664/1384 | C: 425/951
[LOSS Ex1] A: 0.62963 | B: 0.60470 | C: 0.60330
[LOGITS Ex2 A] Mean Abs: 2.163 | Max: 5.662
[LOSS Ex2] A: 0.10351 | B: 0.31160 | C: 0.22496
** [JOINT LOSS] ** : 0.825904
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008084 | Grad Max: 0.227132
  -> Layer: shared_layers.0.bias | Grad Mean: 0.695873 | Grad Max: 3.225359
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.005259
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006890 | Grad Max: 0.006890
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004484 | Grad Max: 0.759811
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083223 | Grad Max: 4.253567
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000430 | Grad Max: 0.014776
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045808 | Grad Max: 0.251773
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000655
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009168 | Grad Max: 0.018644
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000453
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002359 | Grad Max: 0.007281
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000942 | Grad Max: 0.002946
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030365 | Grad Max: 0.030365
[GRADIENT NORM TOTAL] 15.6298

[EPOCH SUMMARY] Train Loss: 0.8233

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8140 | Alpha: 0.5500
No improve count: 6/15

############################## EPOCH 180/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.955
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5412638  0.45873618] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 693/1355 | B: 674/1374 | C: 661/1387
[LOSS Ex1] A: 0.63048 | B: 0.60019 | C: 0.60583
[LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.055
[LOSS Ex2] A: 0.11990 | B: 0.32010 | C: 0.22744
** [JOINT LOSS] ** : 0.834645
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012204 | Grad Max: 0.417684
  -> Layer: shared_layers.0.bias | Grad Mean: 1.140199 | Grad Max: 5.679196
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005966
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012628 | Grad Max: 0.012628
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007102 | Grad Max: 1.288027
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.132314 | Grad Max: 7.170654
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000687 | Grad Max: 0.020348
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.073394 | Grad Max: 0.351500
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.000999
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.014642 | Grad Max: 0.029316
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000759
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003771 | Grad Max: 0.011659
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001551 | Grad Max: 0.003993
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.049233 | Grad Max: 0.049233
[GRADIENT NORM TOTAL] 25.1413

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.202
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8319328  0.16806722] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 747/1301 | B: 626/1230 | C: 665/1383
[LOSS Ex1] A: 0.62378 | B: 0.60415 | C: 0.59981
[LOGITS Ex2 A] Mean Abs: 2.191 | Max: 6.391
[LOSS Ex2] A: 0.11037 | B: 0.30026 | C: 0.21245
** [JOINT LOSS] ** : 0.816939
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011974 | Grad Max: 0.313637
  -> Layer: shared_layers.0.bias | Grad Mean: 0.864074 | Grad Max: 4.338882
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005765
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003586 | Grad Max: 0.003586
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005484 | Grad Max: 0.945302
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.101382 | Grad Max: 5.257596
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000547 | Grad Max: 0.016274
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057359 | Grad Max: 0.278352
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000753
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011722 | Grad Max: 0.022523
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000600
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003084 | Grad Max: 0.009637
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001305 | Grad Max: 0.003684
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040956 | Grad Max: 0.040956
[GRADIENT NORM TOTAL] 18.8770

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.336
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009262  0.49907383] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 712/1336
[LOSS Ex1] A: 0.63112 | B: 0.60440 | C: 0.58453
[LOGITS Ex2 A] Mean Abs: 2.161 | Max: 5.975
[LOSS Ex2] A: 0.09550 | B: 0.30381 | C: 0.20544
** [JOINT LOSS] ** : 0.808270
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.001980 | Grad Max: 0.062157
  -> Layer: shared_layers.0.bias | Grad Mean: 0.114037 | Grad Max: 0.883530
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005563
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007189 | Grad Max: 0.007189
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.000739 | Grad Max: 0.177871
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.012901 | Grad Max: 0.953984
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.002454
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.005609 | Grad Max: 0.035070
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000143
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.001001 | Grad Max: 0.004031
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000135
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000241 | Grad Max: 0.001256
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000443 | Grad Max: 0.001144
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.002470 | Grad Max: 0.002470
[GRADIENT NORM TOTAL] 2.6280

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.940
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75479656 0.2452034 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 664/1384 | C: 654/1394
[LOSS Ex1] A: 0.62601 | B: 0.60464 | C: 0.60668
[LOGITS Ex2 A] Mean Abs: 2.083 | Max: 6.197
[LOSS Ex2] A: 0.11674 | B: 0.32863 | C: 0.21200
** [JOINT LOSS] ** : 0.831562
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010734 | Grad Max: 0.321227
  -> Layer: shared_layers.0.bias | Grad Mean: 0.951551 | Grad Max: 4.186574
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.005428
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003641 | Grad Max: 0.003641
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005879 | Grad Max: 0.729337
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.109604 | Grad Max: 4.022493
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000598 | Grad Max: 0.017524
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063893 | Grad Max: 0.326924
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000855
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012841 | Grad Max: 0.025220
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000713
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003331 | Grad Max: 0.011406
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001403 | Grad Max: 0.003672
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043774 | Grad Max: 0.043774
[GRADIENT NORM TOTAL] 19.7634

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.095
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64798343 0.35201657] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 591/1025 | B: 674/1374 | C: 687/1361
[LOSS Ex1] A: 0.62408 | B: 0.60013 | C: 0.60138
[LOGITS Ex2 A] Mean Abs: 2.109 | Max: 8.538
[LOSS Ex2] A: 0.11295 | B: 0.33531 | C: 0.22216
** [JOINT LOSS] ** : 0.832004
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.013421 | Grad Max: 0.375179
  -> Layer: shared_layers.0.bias | Grad Mean: 1.102917 | Grad Max: 5.012146
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005150
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000821 | Grad Max: 0.000821
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007031 | Grad Max: 0.938939
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.130475 | Grad Max: 5.176515
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000710 | Grad Max: 0.021349
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.075510 | Grad Max: 0.379250
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001005
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.015270 | Grad Max: 0.029988
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000862
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003986 | Grad Max: 0.012963
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001693 | Grad Max: 0.004164
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.052847 | Grad Max: 0.052847
[GRADIENT NORM TOTAL] 23.4721

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.338
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084681  0.49153194] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 626/1230 | C: 661/1387
[LOSS Ex1] A: 0.62486 | B: 0.60408 | C: 0.60208
[LOGITS Ex2 A] Mean Abs: 2.103 | Max: 7.781
[LOSS Ex2] A: 0.10128 | B: 0.32336 | C: 0.20980
** [JOINT LOSS] ** : 0.821822
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009699 | Grad Max: 0.290844
  -> Layer: shared_layers.0.bias | Grad Mean: 0.863885 | Grad Max: 3.835822
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002067 | Grad Max: 0.005571
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000499 | Grad Max: 0.000499
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005464 | Grad Max: 0.707984
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102292 | Grad Max: 3.909624
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000581 | Grad Max: 0.019454
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061994 | Grad Max: 0.331032
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.000811
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012329 | Grad Max: 0.024060
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000713
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003182 | Grad Max: 0.011476
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001347 | Grad Max: 0.003843
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041738 | Grad Max: 0.041738
[GRADIENT NORM TOTAL] 18.2734

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.252
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039877  0.49601227] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 719/1329 | B: 676/1372 | C: 659/1389
[LOSS Ex1] A: 0.62153 | B: 0.60433 | C: 0.60141
[LOGITS Ex2 A] Mean Abs: 2.153 | Max: 7.709
[LOSS Ex2] A: 0.09776 | B: 0.30647 | C: 0.20947
** [JOINT LOSS] ** : 0.813658
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002779 | Grad Max: 0.080755
  -> Layer: shared_layers.0.bias | Grad Mean: 0.137526 | Grad Max: 0.804731
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006276
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005578 | Grad Max: 0.005578
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001008 | Grad Max: 0.205395
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.017566 | Grad Max: 1.158248
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.002540
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.003553 | Grad Max: 0.041280
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000113
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.000538 | Grad Max: 0.002948
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000125
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000133 | Grad Max: 0.001467
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000416 | Grad Max: 0.001415
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.001049 | Grad Max: 0.001049
[GRADIENT NORM TOTAL] 3.5706

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.291
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51209366 0.48790634] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 665/1383 | C: 685/1363
[LOSS Ex1] A: 0.62046 | B: 0.60456 | C: 0.59781
[LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.803
[LOSS Ex2] A: 0.12469 | B: 0.32587 | C: 0.22481
** [JOINT LOSS] ** : 0.832734
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011909 | Grad Max: 0.346940
  -> Layer: shared_layers.0.bias | Grad Mean: 1.006631 | Grad Max: 4.741487
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005736
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001583 | Grad Max: 0.001583
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006523 | Grad Max: 0.953069
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.119816 | Grad Max: 5.293045
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000631 | Grad Max: 0.018263
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.066879 | Grad Max: 0.330969
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000988
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013616 | Grad Max: 0.027816
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000740
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003521 | Grad Max: 0.011463
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001468 | Grad Max: 0.003901
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.045794 | Grad Max: 0.045794
[GRADIENT NORM TOTAL] 22.2375

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.068
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004195  0.49958047] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 674/1374 | C: 682/1366
[LOSS Ex1] A: 0.62949 | B: 0.60004 | C: 0.59861
[LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.147
[LOSS Ex2] A: 0.10060 | B: 0.33004 | C: 0.22327
** [JOINT LOSS] ** : 0.827350
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011831 | Grad Max: 0.495697
  -> Layer: shared_layers.0.bias | Grad Mean: 1.300507 | Grad Max: 6.744269
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.005129
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000592 | Grad Max: 0.000592
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008131 | Grad Max: 1.193854
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.151456 | Grad Max: 6.680124
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000774 | Grad Max: 0.023513
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.083604 | Grad Max: 0.430340
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001083
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.016764 | Grad Max: 0.031962
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000924
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004399 | Grad Max: 0.013944
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001806 | Grad Max: 0.004696
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057706 | Grad Max: 0.057706
[GRADIENT NORM TOTAL] 28.6012

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.957
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5411277  0.45887235] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 626/1230 | C: 653/1395
[LOSS Ex1] A: 0.63035 | B: 0.60398 | C: 0.60618
[LOGITS Ex2 A] Mean Abs: 2.167 | Max: 6.610
[LOSS Ex2] A: 0.10819 | B: 0.31185 | C: 0.21101
** [JOINT LOSS] ** : 0.823855
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007445 | Grad Max: 0.369206
  -> Layer: shared_layers.0.bias | Grad Mean: 0.929726 | Grad Max: 4.902701
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.005190
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008448 | Grad Max: 0.008448
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005785 | Grad Max: 0.861867
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107875 | Grad Max: 4.777722
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000564 | Grad Max: 0.018446
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061285 | Grad Max: 0.331534
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000782
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012272 | Grad Max: 0.024042
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000620
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003265 | Grad Max: 0.009971
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001351 | Grad Max: 0.003892
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043441 | Grad Max: 0.043441
[GRADIENT NORM TOTAL] 20.6043

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.205
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.83235884 0.16764121] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 676/1372 | C: 661/1387
[LOSS Ex1] A: 0.62365 | B: 0.60423 | C: 0.59803
[LOGITS Ex2 A] Mean Abs: 2.144 | Max: 7.407
[LOSS Ex2] A: 0.09185 | B: 0.30656 | C: 0.20691
** [JOINT LOSS] ** : 0.810412
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.092863
  -> Layer: shared_layers.0.bias | Grad Mean: 0.219473 | Grad Max: 1.097072
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.005902
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006709 | Grad Max: 0.006709
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001438 | Grad Max: 0.281888
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.026424 | Grad Max: 1.562408
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.004788
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.011264 | Grad Max: 0.073815
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000200
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.002220 | Grad Max: 0.005439
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000164
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.000590 | Grad Max: 0.002139
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000479 | Grad Max: 0.001594
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.007801 | Grad Max: 0.007801
[GRADIENT NORM TOTAL] 5.5189

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.340
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50100964 0.49899033] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 665/1383 | C: 709/1339
[LOSS Ex1] A: 0.63098 | B: 0.60446 | C: 0.59885
[LOGITS Ex2 A] Mean Abs: 2.086 | Max: 5.655
[LOSS Ex2] A: 0.10284 | B: 0.33029 | C: 0.21974
** [JOINT LOSS] ** : 0.829058
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012635 | Grad Max: 0.329334
  -> Layer: shared_layers.0.bias | Grad Mean: 0.946360 | Grad Max: 4.471603
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.005470
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000405 | Grad Max: 0.000405
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005914 | Grad Max: 1.109685
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.109519 | Grad Max: 6.144238
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000596 | Grad Max: 0.018218
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.062609 | Grad Max: 0.316035
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000942
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012640 | Grad Max: 0.024925
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000708
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003307 | Grad Max: 0.011044
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001467 | Grad Max: 0.003986
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044371 | Grad Max: 0.044371
[GRADIENT NORM TOTAL] 20.6673

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.942
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7551747  0.24482533] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 674/1374 | C: 714/1334
[LOSS Ex1] A: 0.62587 | B: 0.59994 | C: 0.59296
[LOGITS Ex2 A] Mean Abs: 2.048 | Max: 6.952
[LOSS Ex2] A: 0.13758 | B: 0.33670 | C: 0.24319
** [JOINT LOSS] ** : 0.845414
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.019723 | Grad Max: 0.483343
  -> Layer: shared_layers.0.bias | Grad Mean: 1.338564 | Grad Max: 5.760572
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005506
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007868 | Grad Max: 0.007868
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008505 | Grad Max: 1.278497
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.157512 | Grad Max: 7.074866
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000884 | Grad Max: 0.024046
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.092796 | Grad Max: 0.441927
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001360
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.018974 | Grad Max: 0.036242
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.001067
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004980 | Grad Max: 0.016062
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002225 | Grad Max: 0.005274
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.067175 | Grad Max: 0.067175
[GRADIENT NORM TOTAL] 28.1168

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.098
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6481313 0.3518687] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 590/1026 | B: 626/1230 | C: 459/917
[LOSS Ex1] A: 0.62395 | B: 0.60388 | C: 0.59590
[LOGITS Ex2 A] Mean Abs: 2.094 | Max: 8.766
[LOSS Ex2] A: 0.12198 | B: 0.32832 | C: 0.21034
** [JOINT LOSS] ** : 0.828125
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.016165 | Grad Max: 0.405707
  -> Layer: shared_layers.0.bias | Grad Mean: 1.145751 | Grad Max: 5.206437
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005741
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008732 | Grad Max: 0.008732
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007477 | Grad Max: 1.100398
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.138557 | Grad Max: 6.091342
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000771 | Grad Max: 0.023124
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.081271 | Grad Max: 0.402823
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001275
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.016571 | Grad Max: 0.032262
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000943
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004323 | Grad Max: 0.014284
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001903 | Grad Max: 0.004730
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.057321 | Grad Max: 0.057321
[GRADIENT NORM TOTAL] 24.7965

[EPOCH SUMMARY] Train Loss: 0.8254

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.7993 | Alpha: 0.5500
No improve count: 7/15

############################## EPOCH 181/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.341
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50849855 0.49150142] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 676/1372 | C: 693/1355
[LOSS Ex1] A: 0.62473 | B: 0.60414 | C: 0.59546
[LOGITS Ex2 A] Mean Abs: 2.157 | Max: 8.328
[LOSS Ex2] A: 0.09471 | B: 0.30944 | C: 0.18726
** [JOINT LOSS] ** : 0.805250
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005664 | Grad Max: 0.186252
  -> Layer: shared_layers.0.bias | Grad Mean: 0.457221 | Grad Max: 2.534513
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.005753
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000278 | Grad Max: 0.000278
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002925 | Grad Max: 0.746389
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053859 | Grad Max: 4.133531
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000265 | Grad Max: 0.007987
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.028209 | Grad Max: 0.134859
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000482
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.005749 | Grad Max: 0.012015
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000352
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001476 | Grad Max: 0.005421
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000652 | Grad Max: 0.002723
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.018941 | Grad Max: 0.018941
[GRADIENT NORM TOTAL] 11.4711

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.255
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039017  0.49609825] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 717/1331 | B: 665/1383 | C: 656/1392
[LOSS Ex1] A: 0.62140 | B: 0.60437 | C: 0.60388
[LOGITS Ex2 A] Mean Abs: 2.219 | Max: 6.596
[LOSS Ex2] A: 0.10139 | B: 0.31783 | C: 0.22297
** [JOINT LOSS] ** : 0.823947
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008539 | Grad Max: 0.279840
  -> Layer: shared_layers.0.bias | Grad Mean: 0.793455 | Grad Max: 3.857102
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005457
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000193 | Grad Max: 0.000193
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005153 | Grad Max: 0.856730
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095923 | Grad Max: 4.726748
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000514 | Grad Max: 0.015879
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.055067 | Grad Max: 0.294880
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000761
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011015 | Grad Max: 0.021720
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000628
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002875 | Grad Max: 0.009213
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001173 | Grad Max: 0.003495
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037139 | Grad Max: 0.037139
[GRADIENT NORM TOTAL] 17.6390

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.294
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51216656 0.48783347] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 674/1374 | C: 744/1304
[LOSS Ex1] A: 0.62034 | B: 0.59986 | C: 0.59689
[LOGITS Ex2 A] Mean Abs: 2.225 | Max: 6.730
[LOSS Ex2] A: 0.12271 | B: 0.33230 | C: 0.24036
** [JOINT LOSS] ** : 0.837485
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014152 | Grad Max: 0.457765
  -> Layer: shared_layers.0.bias | Grad Mean: 1.273891 | Grad Max: 6.270208
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006460
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003907 | Grad Max: 0.003907
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008284 | Grad Max: 1.216679
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.153394 | Grad Max: 6.722294
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000831 | Grad Max: 0.024538
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.088969 | Grad Max: 0.441241
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000119 | Grad Max: 0.001370
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.018018 | Grad Max: 0.035067
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.001023
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004717 | Grad Max: 0.015310
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002023 | Grad Max: 0.004641
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062207 | Grad Max: 0.062207
[GRADIENT NORM TOTAL] 27.7806

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.071
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003985  0.49960142] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 626/1230 | C: 680/1368
[LOSS Ex1] A: 0.62938 | B: 0.60379 | C: 0.60289
[LOGITS Ex2 A] Mean Abs: 2.194 | Max: 5.970
[LOSS Ex2] A: 0.10496 | B: 0.31596 | C: 0.22621
** [JOINT LOSS] ** : 0.827729
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012660 | Grad Max: 0.384954
  -> Layer: shared_layers.0.bias | Grad Mean: 1.058751 | Grad Max: 5.209836
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.005592
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009363 | Grad Max: 0.009363
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006749 | Grad Max: 1.008286
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.125143 | Grad Max: 5.641489
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000666 | Grad Max: 0.020895
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.070671 | Grad Max: 0.366014
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000986
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.014341 | Grad Max: 0.027955
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000773
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003817 | Grad Max: 0.011849
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001591 | Grad Max: 0.004279
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050014 | Grad Max: 0.050014
[GRADIENT NORM TOTAL] 22.9888

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.959
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5410217 0.4589783] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 676/1372 | C: 706/1342
[LOSS Ex1] A: 0.63025 | B: 0.60406 | C: 0.59508
[LOGITS Ex2 A] Mean Abs: 2.155 | Max: 5.743
[LOSS Ex2] A: 0.10266 | B: 0.29375 | C: 0.22097
** [JOINT LOSS] ** : 0.815593
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004250 | Grad Max: 0.110176
  -> Layer: shared_layers.0.bias | Grad Mean: 0.274328 | Grad Max: 1.155234
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.005808
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011570 | Grad Max: 0.011570
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.001923 | Grad Max: 0.265539
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.034682 | Grad Max: 1.465674
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000197 | Grad Max: 0.005152
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.020679 | Grad Max: 0.093007
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000353
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.004337 | Grad Max: 0.009845
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000235
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001136 | Grad Max: 0.003302
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000475 | Grad Max: 0.001912
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.014468 | Grad Max: 0.014468
[GRADIENT NORM TOTAL] 6.0497

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.207
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8328265  0.16717349] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 665/1383 | C: 653/1395
[LOSS Ex1] A: 0.62354 | B: 0.60430 | C: 0.60497
[LOGITS Ex2 A] Mean Abs: 2.098 | Max: 8.174
[LOSS Ex2] A: 0.09703 | B: 0.32433 | C: 0.20397
** [JOINT LOSS] ** : 0.819378
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006866 | Grad Max: 0.343116
  -> Layer: shared_layers.0.bias | Grad Mean: 0.865469 | Grad Max: 4.591135
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005275
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000942 | Grad Max: 0.000942
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005267 | Grad Max: 1.021369
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.098702 | Grad Max: 5.628523
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000508 | Grad Max: 0.015962
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.055446 | Grad Max: 0.283971
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000732
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011034 | Grad Max: 0.022032
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000650
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002891 | Grad Max: 0.009746
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001219 | Grad Max: 0.003573
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038200 | Grad Max: 0.038200
[GRADIENT NORM TOTAL] 19.2770

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.342
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010339  0.49896613] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 674/1374 | C: 719/1329
[LOSS Ex1] A: 0.63088 | B: 0.59978 | C: 0.60187
[LOGITS Ex2 A] Mean Abs: 2.072 | Max: 5.962
[LOSS Ex2] A: 0.10446 | B: 0.34119 | C: 0.22165
** [JOINT LOSS] ** : 0.833278
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.014789 | Grad Max: 0.439451
  -> Layer: shared_layers.0.bias | Grad Mean: 1.276376 | Grad Max: 5.837375
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.004999
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000715 | Grad Max: 0.000715
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.008127 | Grad Max: 1.157853
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.151024 | Grad Max: 6.398695
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000830 | Grad Max: 0.023440
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.088756 | Grad Max: 0.438844
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000120 | Grad Max: 0.001290
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017996 | Grad Max: 0.035083
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.001076
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004738 | Grad Max: 0.015818
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.002000 | Grad Max: 0.004744
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.062002 | Grad Max: 0.062002
[GRADIENT NORM TOTAL] 27.2938

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.944
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75549066 0.24450935] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 626/1230 | C: 774/1274
[LOSS Ex1] A: 0.62576 | B: 0.60372 | C: 0.59449
[LOGITS Ex2 A] Mean Abs: 2.044 | Max: 6.486
[LOSS Ex2] A: 0.12818 | B: 0.34887 | C: 0.22770
** [JOINT LOSS] ** : 0.842904
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.015264 | Grad Max: 0.424163
  -> Layer: shared_layers.0.bias | Grad Mean: 1.253330 | Grad Max: 5.622044
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005281
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000437 | Grad Max: 0.000437
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007937 | Grad Max: 1.155499
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.147581 | Grad Max: 6.366715
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000821 | Grad Max: 0.024125
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.087292 | Grad Max: 0.428988
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000119 | Grad Max: 0.001286
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.017647 | Grad Max: 0.036735
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.001010
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.004645 | Grad Max: 0.015039
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001998 | Grad Max: 0.004693
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.061202 | Grad Max: 0.061202
[GRADIENT NORM TOTAL] 26.4208

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.100
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64820623 0.35179377] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 589/1027 | B: 676/1372 | C: 755/1293
[LOSS Ex1] A: 0.62385 | B: 0.60400 | C: 0.58828
[LOGITS Ex2 A] Mean Abs: 2.169 | Max: 10.493
[LOSS Ex2] A: 0.10490 | B: 0.32351 | C: 0.19386
** [JOINT LOSS] ** : 0.812794
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008289 | Grad Max: 0.261143
  -> Layer: shared_layers.0.bias | Grad Mean: 0.737776 | Grad Max: 3.470728
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.005817
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006050 | Grad Max: 0.006050
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004660 | Grad Max: 0.667757
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.086252 | Grad Max: 3.681400
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000480 | Grad Max: 0.014508
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.051287 | Grad Max: 0.256809
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000787
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010392 | Grad Max: 0.021093
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000629
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002715 | Grad Max: 0.009586
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001166 | Grad Max: 0.003540
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035673 | Grad Max: 0.035673
[GRADIENT NORM TOTAL] 15.6769

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.343
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084822  0.49151775] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 665/1383 | C: 746/1302
[LOSS Ex1] A: 0.62463 | B: 0.60423 | C: 0.59744
[LOGITS Ex2 A] Mean Abs: 2.210 | Max: 10.267
[LOSS Ex2] A: 0.09810 | B: 0.30835 | C: 0.21083
** [JOINT LOSS] ** : 0.814525
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006288 | Grad Max: 0.208296
  -> Layer: shared_layers.0.bias | Grad Mean: 0.572291 | Grad Max: 2.733333
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005842
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000597 | Grad Max: 0.000597
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003602 | Grad Max: 0.675523
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066386 | Grad Max: 3.778288
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000326 | Grad Max: 0.010676
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034805 | Grad Max: 0.193968
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000571
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007099 | Grad Max: 0.014458
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000408
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001900 | Grad Max: 0.006260
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000748 | Grad Max: 0.002647
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024381 | Grad Max: 0.024381
[GRADIENT NORM TOTAL] 13.2643

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.257
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037917  0.49620828] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 674/1374 | C: 719/1329
[LOSS Ex1] A: 0.62130 | B: 0.59972 | C: 0.60188
[LOGITS Ex2 A] Mean Abs: 2.243 | Max: 6.481
[LOSS Ex2] A: 0.10897 | B: 0.31553 | C: 0.24473
** [JOINT LOSS] ** : 0.830712
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.012194 | Grad Max: 0.437807
  -> Layer: shared_layers.0.bias | Grad Mean: 1.175393 | Grad Max: 5.886418
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.005743
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007046 | Grad Max: 0.007046
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.007387 | Grad Max: 1.249426
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.138085 | Grad Max: 6.945692
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000698 | Grad Max: 0.021153
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.074975 | Grad Max: 0.385964
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001112
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.015049 | Grad Max: 0.030175
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000838
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003945 | Grad Max: 0.013273
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.004017
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.050869 | Grad Max: 0.050869
[GRADIENT NORM TOTAL] 26.1125

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.296
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51224107 0.48775893] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 626/1230 | C: 713/1335
[LOSS Ex1] A: 0.62024 | B: 0.60365 | C: 0.60253
[LOGITS Ex2 A] Mean Abs: 2.210 | Max: 7.454
[LOSS Ex2] A: 0.12129 | B: 0.31529 | C: 0.23862
** [JOINT LOSS] ** : 0.833870
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011654 | Grad Max: 0.373325
  -> Layer: shared_layers.0.bias | Grad Mean: 1.038238 | Grad Max: 5.151225
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005741
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000863 | Grad Max: 0.000863
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006463 | Grad Max: 1.179326
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.120214 | Grad Max: 6.546393
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000632 | Grad Max: 0.019247
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.067839 | Grad Max: 0.344869
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000923
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013626 | Grad Max: 0.026547
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000743
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003584 | Grad Max: 0.011475
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001474 | Grad Max: 0.003978
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046171 | Grad Max: 0.046171
[GRADIENT NORM TOTAL] 22.9715

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.072
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040674 0.49959326] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 676/1372 | C: 728/1320
[LOSS Ex1] A: 0.62927 | B: 0.60394 | C: 0.60376
[LOGITS Ex2 A] Mean Abs: 2.151 | Max: 5.053
[LOSS Ex2] A: 0.10295 | B: 0.30788 | C: 0.22525
** [JOINT LOSS] ** : 0.824350
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006617 | Grad Max: 0.223704
  -> Layer: shared_layers.0.bias | Grad Mean: 0.552977 | Grad Max: 2.729132
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.005230
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000611 | Grad Max: 0.000611
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003421 | Grad Max: 0.431713
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064203 | Grad Max: 2.408695
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.010534
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037502 | Grad Max: 0.187746
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000550
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007374 | Grad Max: 0.014775
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000425
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001947 | Grad Max: 0.006431
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000806 | Grad Max: 0.002721
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025796 | Grad Max: 0.025796
[GRADIENT NORM TOTAL] 11.7462

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409509  0.45904914] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 665/1383 | C: 474/902
[LOSS Ex1] A: 0.63015 | B: 0.60417 | C: 0.59800
[LOGITS Ex2 A] Mean Abs: 2.068 | Max: 5.463
[LOSS Ex2] A: 0.10427 | B: 0.31660 | C: 0.19921
** [JOINT LOSS] ** : 0.817461
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007959 | Grad Max: 0.201843
  -> Layer: shared_layers.0.bias | Grad Mean: 0.598077 | Grad Max: 2.708833
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005533
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008923 | Grad Max: 0.008923
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003854 | Grad Max: 0.765684
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070968 | Grad Max: 4.276421
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000376 | Grad Max: 0.011868
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039793 | Grad Max: 0.195335
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000622
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008250 | Grad Max: 0.016679
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000486
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002204 | Grad Max: 0.007579
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000980 | Grad Max: 0.003380
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029630 | Grad Max: 0.029630
[GRADIENT NORM TOTAL] 13.3572

[EPOCH SUMMARY] Train Loss: 0.8242

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8153 | Alpha: 0.5500
No improve count: 8/15

############################## EPOCH 182/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318764 0.16681242] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 674/1374 | C: 729/1319
[LOSS Ex1] A: 0.62344 | B: 0.59966 | C: 0.60014
[LOGITS Ex2 A] Mean Abs: 2.086 | Max: 6.796
[LOSS Ex2] A: 0.09502 | B: 0.31801 | C: 0.21735
** [JOINT LOSS] ** : 0.817877
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009238 | Grad Max: 0.328264
  -> Layer: shared_layers.0.bias | Grad Mean: 0.918675 | Grad Max: 4.423221
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005750
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000387 | Grad Max: 0.000387
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005689 | Grad Max: 1.037361
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.106070 | Grad Max: 5.735444
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000566 | Grad Max: 0.017584
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061050 | Grad Max: 0.311344
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000891
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012377 | Grad Max: 0.024518
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000710
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003273 | Grad Max: 0.011049
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.003822
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042882 | Grad Max: 0.042882
[GRADIENT NORM TOTAL] 20.1041

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010789  0.49892107] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 626/1230 | C: 712/1336
[LOSS Ex1] A: 0.63079 | B: 0.60360 | C: 0.59879
[LOGITS Ex2 A] Mean Abs: 2.119 | Max: 6.312
[LOSS Ex2] A: 0.09738 | B: 0.33013 | C: 0.20975
** [JOINT LOSS] ** : 0.823484
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009656 | Grad Max: 0.319118
  -> Layer: shared_layers.0.bias | Grad Mean: 0.954306 | Grad Max: 4.329672
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.005557
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002628 | Grad Max: 0.002628
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005919 | Grad Max: 0.947362
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.109781 | Grad Max: 5.246794
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000596 | Grad Max: 0.018699
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063916 | Grad Max: 0.325148
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000907
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012848 | Grad Max: 0.026566
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000700
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003360 | Grad Max: 0.011308
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001391 | Grad Max: 0.003764
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042972 | Grad Max: 0.042972
[GRADIENT NORM TOTAL] 20.5628

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557117  0.24428837] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 676/1372 | C: 725/1323
[LOSS Ex1] A: 0.62568 | B: 0.60391 | C: 0.59543
[LOGITS Ex2 A] Mean Abs: 2.078 | Max: 6.209
[LOSS Ex2] A: 0.11882 | B: 0.34099 | C: 0.20717
** [JOINT LOSS] ** : 0.830667
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011517 | Grad Max: 0.355626
  -> Layer: shared_layers.0.bias | Grad Mean: 1.023392 | Grad Max: 4.862053
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.005331
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004903 | Grad Max: 0.004903
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006347 | Grad Max: 1.078242
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.118241 | Grad Max: 5.969844
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000631 | Grad Max: 0.019100
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.067553 | Grad Max: 0.353090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000975
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013604 | Grad Max: 0.026864
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000768
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003589 | Grad Max: 0.011984
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001513 | Grad Max: 0.003931
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.046795 | Grad Max: 0.046795
[GRADIENT NORM TOTAL] 22.3638

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482954 0.3517046] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 665/1383 | C: 766/1282
[LOSS Ex1] A: 0.62377 | B: 0.60415 | C: 0.59766
[LOGITS Ex2 A] Mean Abs: 2.111 | Max: 9.845
[LOSS Ex2] A: 0.10910 | B: 0.34055 | C: 0.21249
** [JOINT LOSS] ** : 0.829243
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011286 | Grad Max: 0.358085
  -> Layer: shared_layers.0.bias | Grad Mean: 1.018807 | Grad Max: 4.721560
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005758
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011130 | Grad Max: 0.011130
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006296 | Grad Max: 1.036957
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.117389 | Grad Max: 5.750445
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000631 | Grad Max: 0.017356
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.067817 | Grad Max: 0.318531
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001014
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013719 | Grad Max: 0.026741
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000830
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003633 | Grad Max: 0.012140
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001582 | Grad Max: 0.004081
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048661 | Grad Max: 0.048661
[GRADIENT NORM TOTAL] 22.0855

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085099 0.4914901] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 674/1374 | C: 721/1327
[LOSS Ex1] A: 0.62457 | B: 0.59966 | C: 0.59627
[LOGITS Ex2 A] Mean Abs: 2.114 | Max: 8.222
[LOSS Ex2] A: 0.10138 | B: 0.31365 | C: 0.22196
** [JOINT LOSS] ** : 0.819165
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010435 | Grad Max: 0.318584
  -> Layer: shared_layers.0.bias | Grad Mean: 0.916278 | Grad Max: 4.278434
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005338
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001219 | Grad Max: 0.001219
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005772 | Grad Max: 0.947616
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107413 | Grad Max: 5.256381
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000580 | Grad Max: 0.017105
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.062114 | Grad Max: 0.298469
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000920
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012585 | Grad Max: 0.024262
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000724
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003311 | Grad Max: 0.011043
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001422 | Grad Max: 0.003908
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043452 | Grad Max: 0.043452
[GRADIENT NORM TOTAL] 19.9367

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037942  0.49620578] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 626/1230 | C: 760/1288
[LOSS Ex1] A: 0.62126 | B: 0.60360 | C: 0.59082
[LOGITS Ex2 A] Mean Abs: 2.111 | Max: 7.276
[LOSS Ex2] A: 0.10507 | B: 0.32218 | C: 0.20565
** [JOINT LOSS] ** : 0.816194
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009025 | Grad Max: 0.312053
  -> Layer: shared_layers.0.bias | Grad Mean: 0.871481 | Grad Max: 4.107564
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002292 | Grad Max: 0.006247
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008052 | Grad Max: 0.008052
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005407 | Grad Max: 0.914251
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.100590 | Grad Max: 5.046986
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.014918
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057300 | Grad Max: 0.286131
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000849
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011521 | Grad Max: 0.022789
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000668
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003007 | Grad Max: 0.010157
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001251 | Grad Max: 0.003456
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038673 | Grad Max: 0.038673
[GRADIENT NORM TOTAL] 18.9663

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122487 0.4877513] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 676/1372 | C: 739/1309
[LOSS Ex1] A: 0.62021 | B: 0.60391 | C: 0.59352
[LOGITS Ex2 A] Mean Abs: 2.074 | Max: 5.524
[LOSS Ex2] A: 0.11306 | B: 0.34876 | C: 0.21497
** [JOINT LOSS] ** : 0.831477
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009067 | Grad Max: 0.375805
  -> Layer: shared_layers.0.bias | Grad Mean: 0.995166 | Grad Max: 4.933049
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.005760
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002888 | Grad Max: 0.002888
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006170 | Grad Max: 1.024768
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.115370 | Grad Max: 5.675250
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000607 | Grad Max: 0.019684
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065863 | Grad Max: 0.339035
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.000890
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013216 | Grad Max: 0.025752
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000755
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003535 | Grad Max: 0.011514
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001502 | Grad Max: 0.003905
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047425 | Grad Max: 0.047425
[GRADIENT NORM TOTAL] 21.8813

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004019  0.49959806] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 665/1383 | C: 736/1312
[LOSS Ex1] A: 0.62925 | B: 0.60415 | C: 0.60157
[LOGITS Ex2 A] Mean Abs: 2.053 | Max: 5.597
[LOSS Ex2] A: 0.10599 | B: 0.33440 | C: 0.21214
** [JOINT LOSS] ** : 0.829170
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009965 | Grad Max: 0.349314
  -> Layer: shared_layers.0.bias | Grad Mean: 0.980156 | Grad Max: 4.550669
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.005962
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009470 | Grad Max: 0.009470
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006099 | Grad Max: 1.020183
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.113485 | Grad Max: 5.648369
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000612 | Grad Max: 0.017375
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065767 | Grad Max: 0.329037
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.000966
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013337 | Grad Max: 0.027341
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000768
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003555 | Grad Max: 0.011665
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001552 | Grad Max: 0.004132
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047783 | Grad Max: 0.047783
[GRADIENT NORM TOTAL] 21.2617

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409568 0.4590432] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 674/1374 | C: 700/1348
[LOSS Ex1] A: 0.63014 | B: 0.59966 | C: 0.60361
[LOGITS Ex2 A] Mean Abs: 2.040 | Max: 5.775
[LOSS Ex2] A: 0.11307 | B: 0.32230 | C: 0.22074
** [JOINT LOSS] ** : 0.829839
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009811 | Grad Max: 0.347796
  -> Layer: shared_layers.0.bias | Grad Mean: 0.946211 | Grad Max: 4.524896
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005533
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010028 | Grad Max: 0.010028
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005898 | Grad Max: 1.020629
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.109581 | Grad Max: 5.628964
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000589 | Grad Max: 0.017480
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063492 | Grad Max: 0.309484
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000910
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012827 | Grad Max: 0.025105
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000793
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003381 | Grad Max: 0.011985
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001432 | Grad Max: 0.003847
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044663 | Grad Max: 0.044663
[GRADIENT NORM TOTAL] 20.6961

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331878  0.16681226] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 626/1230 | C: 720/1328
[LOSS Ex1] A: 0.62344 | B: 0.60360 | C: 0.59837
[LOGITS Ex2 A] Mean Abs: 2.083 | Max: 7.577
[LOSS Ex2] A: 0.10373 | B: 0.31781 | C: 0.20114
** [JOINT LOSS] ** : 0.816032
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008428 | Grad Max: 0.328984
  -> Layer: shared_layers.0.bias | Grad Mean: 0.900294 | Grad Max: 4.416506
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.005402
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004483 | Grad Max: 0.004483
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005546 | Grad Max: 0.935756
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.103707 | Grad Max: 5.182055
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.015305
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059726 | Grad Max: 0.292637
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000788
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011979 | Grad Max: 0.023295
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000667
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003127 | Grad Max: 0.010713
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001274 | Grad Max: 0.003580
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040132 | Grad Max: 0.040132
[GRADIENT NORM TOTAL] 19.7080

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.501079 0.498921] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 723/1325
[LOSS Ex1] A: 0.63079 | B: 0.60391 | C: 0.60271
[LOGITS Ex2 A] Mean Abs: 2.098 | Max: 6.690
[LOSS Ex2] A: 0.09511 | B: 0.34616 | C: 0.22206
** [JOINT LOSS] ** : 0.833579
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010202 | Grad Max: 0.356772
  -> Layer: shared_layers.0.bias | Grad Mean: 0.983559 | Grad Max: 4.595041
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001994 | Grad Max: 0.005807
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001700 | Grad Max: 0.001700
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006098 | Grad Max: 0.915612
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.113111 | Grad Max: 5.116548
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000608 | Grad Max: 0.019124
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065711 | Grad Max: 0.330930
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.000898
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013364 | Grad Max: 0.025855
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000766
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003585 | Grad Max: 0.012157
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001525 | Grad Max: 0.003989
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047915 | Grad Max: 0.047915
[GRADIENT NORM TOTAL] 21.1086

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557115 0.2442885] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 665/1383 | C: 715/1333
[LOSS Ex1] A: 0.62568 | B: 0.60415 | C: 0.60103
[LOGITS Ex2 A] Mean Abs: 2.067 | Max: 6.967
[LOSS Ex2] A: 0.11569 | B: 0.33727 | C: 0.22225
** [JOINT LOSS] ** : 0.835356
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011395 | Grad Max: 0.336011
  -> Layer: shared_layers.0.bias | Grad Mean: 0.978288 | Grad Max: 4.492485
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005567
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005836 | Grad Max: 0.005836
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006125 | Grad Max: 0.872082
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.113528 | Grad Max: 4.914037
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000624 | Grad Max: 0.021101
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.066747 | Grad Max: 0.359656
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001020
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013519 | Grad Max: 0.026197
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000841
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003564 | Grad Max: 0.011954
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001542 | Grad Max: 0.004162
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.047386 | Grad Max: 0.047386
[GRADIENT NORM TOTAL] 20.8331

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482948  0.35170516] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 674/1374 | C: 716/1332
[LOSS Ex1] A: 0.62377 | B: 0.59966 | C: 0.60198
[LOGITS Ex2 A] Mean Abs: 2.112 | Max: 8.367
[LOSS Ex2] A: 0.10073 | B: 0.31615 | C: 0.20904
** [JOINT LOSS] ** : 0.817113
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010066 | Grad Max: 0.308824
  -> Layer: shared_layers.0.bias | Grad Mean: 0.888428 | Grad Max: 4.137380
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005127
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003365 | Grad Max: 0.003365
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005545 | Grad Max: 0.812196
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102532 | Grad Max: 4.601460
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000551 | Grad Max: 0.016639
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059425 | Grad Max: 0.288355
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000912
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012101 | Grad Max: 0.025264
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000695
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003185 | Grad Max: 0.011023
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003907
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041584 | Grad Max: 0.041584
[GRADIENT NORM TOTAL] 19.0164

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085098  0.49149016] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 626/1230 | C: 474/902
[LOSS Ex1] A: 0.62457 | B: 0.60360 | C: 0.60697
[LOGITS Ex2 A] Mean Abs: 2.110 | Max: 8.889
[LOSS Ex2] A: 0.09943 | B: 0.32316 | C: 0.23344
** [JOINT LOSS] ** : 0.830394
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009674 | Grad Max: 0.309412
  -> Layer: shared_layers.0.bias | Grad Mean: 0.880932 | Grad Max: 4.032309
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.005147
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002907 | Grad Max: 0.002907
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005584 | Grad Max: 0.737628
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.103539 | Grad Max: 4.133241
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000572 | Grad Max: 0.018245
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061285 | Grad Max: 0.313257
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.000909
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012459 | Grad Max: 0.024784
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000687
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003309 | Grad Max: 0.010788
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001439 | Grad Max: 0.003741
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044072 | Grad Max: 0.044072
[GRADIENT NORM TOTAL] 18.7380

[EPOCH SUMMARY] Train Loss: 0.8257

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8139 | Alpha: 0.5500
No improve count: 9/15

############################## EPOCH 183/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037937  0.49620625] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 676/1372 | C: 715/1333
[LOSS Ex1] A: 0.62126 | B: 0.60391 | C: 0.59680
[LOGITS Ex2 A] Mean Abs: 2.113 | Max: 7.228
[LOSS Ex2] A: 0.10723 | B: 0.34330 | C: 0.20203
** [JOINT LOSS] ** : 0.824839
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009334 | Grad Max: 0.332464
  -> Layer: shared_layers.0.bias | Grad Mean: 0.938294 | Grad Max: 4.528007
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006025
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000711 | Grad Max: 0.000711
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005775 | Grad Max: 0.839907
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107061 | Grad Max: 4.705668
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000562 | Grad Max: 0.016851
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.060957 | Grad Max: 0.301503
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000920
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012342 | Grad Max: 0.023988
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000704
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003260 | Grad Max: 0.010736
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003755
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042467 | Grad Max: 0.042467
[GRADIENT NORM TOTAL] 20.3454

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51224893 0.4877511 ] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 665/1383 | C: 750/1298
[LOSS Ex1] A: 0.62021 | B: 0.60415 | C: 0.59510
[LOGITS Ex2 A] Mean Abs: 2.069 | Max: 6.179
[LOSS Ex2] A: 0.11583 | B: 0.33434 | C: 0.21492
** [JOINT LOSS] ** : 0.828186
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010901 | Grad Max: 0.333725
  -> Layer: shared_layers.0.bias | Grad Mean: 0.966557 | Grad Max: 4.477579
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006035
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002211 | Grad Max: 0.002211
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005926 | Grad Max: 0.819668
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.110791 | Grad Max: 4.579196
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000603 | Grad Max: 0.017248
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.064529 | Grad Max: 0.322295
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.000898
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012978 | Grad Max: 0.025532
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000770
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003399 | Grad Max: 0.011905
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001469 | Grad Max: 0.004076
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044803 | Grad Max: 0.044803
[GRADIENT NORM TOTAL] 20.4285

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.500402 0.499598] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 674/1374 | C: 745/1303
[LOSS Ex1] A: 0.62925 | B: 0.59966 | C: 0.59992
[LOGITS Ex2 A] Mean Abs: 2.067 | Max: 6.821
[LOSS Ex2] A: 0.10496 | B: 0.31074 | C: 0.22393
** [JOINT LOSS] ** : 0.822823
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009373 | Grad Max: 0.306000
  -> Layer: shared_layers.0.bias | Grad Mean: 0.875792 | Grad Max: 4.080841
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005388
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001553 | Grad Max: 0.001553
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005526 | Grad Max: 0.851832
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102439 | Grad Max: 4.810162
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000552 | Grad Max: 0.016503
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059435 | Grad Max: 0.309648
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000907
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012044 | Grad Max: 0.024258
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000715
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003143 | Grad Max: 0.010951
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001333 | Grad Max: 0.003885
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040906 | Grad Max: 0.040906
[GRADIENT NORM TOTAL] 18.9220

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.540956   0.45904404] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 626/1230 | C: 695/1353
[LOSS Ex1] A: 0.63014 | B: 0.60360 | C: 0.60423
[LOGITS Ex2 A] Mean Abs: 2.035 | Max: 5.606
[LOSS Ex2] A: 0.11250 | B: 0.32111 | C: 0.21203
** [JOINT LOSS] ** : 0.827867
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010285 | Grad Max: 0.310242
  -> Layer: shared_layers.0.bias | Grad Mean: 0.919015 | Grad Max: 4.116593
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.005194
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005020 | Grad Max: 0.005020
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005744 | Grad Max: 0.756661
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.106334 | Grad Max: 4.267666
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000593 | Grad Max: 0.016937
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063877 | Grad Max: 0.307721
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000957
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012962 | Grad Max: 0.026166
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000778
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003417 | Grad Max: 0.012093
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001425 | Grad Max: 0.004065
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044484 | Grad Max: 0.044484
[GRADIENT NORM TOTAL] 19.4332

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318764 0.16681238] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 676/1372 | C: 743/1305
[LOSS Ex1] A: 0.62344 | B: 0.60391 | C: 0.59136
[LOGITS Ex2 A] Mean Abs: 2.096 | Max: 8.004
[LOSS Ex2] A: 0.09998 | B: 0.34010 | C: 0.20492
** [JOINT LOSS] ** : 0.821238
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008161 | Grad Max: 0.322716
  -> Layer: shared_layers.0.bias | Grad Mean: 0.892944 | Grad Max: 4.386117
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.005812
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002130 | Grad Max: 0.002130
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005472 | Grad Max: 0.811661
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102479 | Grad Max: 4.536824
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000532 | Grad Max: 0.016609
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057735 | Grad Max: 0.304283
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000758
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011627 | Grad Max: 0.022511
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000698
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003087 | Grad Max: 0.010440
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001329 | Grad Max: 0.003668
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041215 | Grad Max: 0.041215
[GRADIENT NORM TOTAL] 19.4958

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50107914 0.49892083] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 665/1383 | C: 726/1322
[LOSS Ex1] A: 0.63079 | B: 0.60415 | C: 0.60011
[LOGITS Ex2 A] Mean Abs: 2.092 | Max: 6.269
[LOSS Ex2] A: 0.09521 | B: 0.32931 | C: 0.20837
** [JOINT LOSS] ** : 0.822647
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009563 | Grad Max: 0.310565
  -> Layer: shared_layers.0.bias | Grad Mean: 0.876339 | Grad Max: 4.038787
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002003 | Grad Max: 0.004859
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002812 | Grad Max: 0.002812
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005472 | Grad Max: 0.797187
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.101483 | Grad Max: 4.473042
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000551 | Grad Max: 0.016415
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059399 | Grad Max: 0.296855
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000846
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012066 | Grad Max: 0.023551
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000717
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003188 | Grad Max: 0.011172
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001360 | Grad Max: 0.003991
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.042233 | Grad Max: 0.042233
[GRADIENT NORM TOTAL] 18.7568

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.755711   0.24428894] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 674/1374 | C: 724/1324
[LOSS Ex1] A: 0.62568 | B: 0.59966 | C: 0.59840
[LOGITS Ex2 A] Mean Abs: 2.071 | Max: 7.110
[LOSS Ex2] A: 0.11372 | B: 0.31499 | C: 0.21961
** [JOINT LOSS] ** : 0.824020
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009756 | Grad Max: 0.310434
  -> Layer: shared_layers.0.bias | Grad Mean: 0.878269 | Grad Max: 4.071934
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005580
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002244 | Grad Max: 0.002244
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005530 | Grad Max: 0.815279
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102557 | Grad Max: 4.572691
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000552 | Grad Max: 0.016188
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059350 | Grad Max: 0.292347
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000862
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012019 | Grad Max: 0.024486
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000720
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003171 | Grad Max: 0.010825
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001363 | Grad Max: 0.003825
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041990 | Grad Max: 0.041990
[GRADIENT NORM TOTAL] 18.9368

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64829385 0.35170615] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 626/1230 | C: 741/1307
[LOSS Ex1] A: 0.62377 | B: 0.60360 | C: 0.59688
[LOGITS Ex2 A] Mean Abs: 2.113 | Max: 8.321
[LOSS Ex2] A: 0.10375 | B: 0.32180 | C: 0.19963
** [JOINT LOSS] ** : 0.816480
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010277 | Grad Max: 0.292904
  -> Layer: shared_layers.0.bias | Grad Mean: 0.897750 | Grad Max: 3.931839
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.006046
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002853 | Grad Max: 0.002853
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005605 | Grad Max: 0.822566
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.104406 | Grad Max: 4.623491
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000580 | Grad Max: 0.018044
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.062167 | Grad Max: 0.316162
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000937
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012557 | Grad Max: 0.024992
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000739
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003267 | Grad Max: 0.011143
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003798
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041569 | Grad Max: 0.041569
[GRADIENT NORM TOTAL] 19.1422

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085097 0.4914903] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 676/1372 | C: 718/1330
[LOSS Ex1] A: 0.62457 | B: 0.60391 | C: 0.59968
[LOGITS Ex2 A] Mean Abs: 2.116 | Max: 8.602
[LOSS Ex2] A: 0.09834 | B: 0.34199 | C: 0.18662
** [JOINT LOSS] ** : 0.818368
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010389 | Grad Max: 0.324354
  -> Layer: shared_layers.0.bias | Grad Mean: 0.921837 | Grad Max: 4.395842
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.005441
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001594 | Grad Max: 0.001594
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005794 | Grad Max: 0.868114
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.107543 | Grad Max: 4.874303
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.017294
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061953 | Grad Max: 0.307217
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000949
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012583 | Grad Max: 0.024566
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000693
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003316 | Grad Max: 0.011122
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001413 | Grad Max: 0.003949
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043667 | Grad Max: 0.043667
[GRADIENT NORM TOTAL] 20.0238

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037931 0.4962069] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 665/1383 | C: 744/1304
[LOSS Ex1] A: 0.62126 | B: 0.60415 | C: 0.59786
[LOGITS Ex2 A] Mean Abs: 2.104 | Max: 7.336
[LOSS Ex2] A: 0.10448 | B: 0.33614 | C: 0.23011
** [JOINT LOSS] ** : 0.831332
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009094 | Grad Max: 0.321649
  -> Layer: shared_layers.0.bias | Grad Mean: 0.882024 | Grad Max: 4.122788
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005471
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001260 | Grad Max: 0.001260
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005469 | Grad Max: 0.873132
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.101688 | Grad Max: 4.878801
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000539 | Grad Max: 0.017371
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.058375 | Grad Max: 0.302425
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000792
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011873 | Grad Max: 0.022531
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000691
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003203 | Grad Max: 0.010242
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001405 | Grad Max: 0.003581
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043776 | Grad Max: 0.043776
[GRADIENT NORM TOTAL] 19.0630

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122492 0.4877508] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 674/1374 | C: 747/1301
[LOSS Ex1] A: 0.62021 | B: 0.59966 | C: 0.59307
[LOGITS Ex2 A] Mean Abs: 2.066 | Max: 6.322
[LOSS Ex2] A: 0.11115 | B: 0.30592 | C: 0.19840
** [JOINT LOSS] ** : 0.809473
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008144 | Grad Max: 0.302986
  -> Layer: shared_layers.0.bias | Grad Mean: 0.860414 | Grad Max: 4.048113
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.005970
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002102 | Grad Max: 0.002102
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005298 | Grad Max: 0.800495
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.099345 | Grad Max: 4.470567
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000537 | Grad Max: 0.016462
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.058317 | Grad Max: 0.299827
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000791
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011596 | Grad Max: 0.023170
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000723
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003039 | Grad Max: 0.010456
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001256 | Grad Max: 0.003923
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039164 | Grad Max: 0.039164
[GRADIENT NORM TOTAL] 18.5137

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004021 0.4995979] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 626/1230 | C: 731/1317
[LOSS Ex1] A: 0.62925 | B: 0.60360 | C: 0.60335
[LOGITS Ex2 A] Mean Abs: 2.053 | Max: 5.645
[LOSS Ex2] A: 0.10408 | B: 0.31809 | C: 0.21458
** [JOINT LOSS] ** : 0.824320
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.010114 | Grad Max: 0.302774
  -> Layer: shared_layers.0.bias | Grad Mean: 0.876203 | Grad Max: 3.934770
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.005316
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004716 | Grad Max: 0.004716
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005553 | Grad Max: 0.780036
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.102846 | Grad Max: 4.394171
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000570 | Grad Max: 0.016756
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.061200 | Grad Max: 0.296207
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000918
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012506 | Grad Max: 0.025189
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000734
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003296 | Grad Max: 0.011185
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001413 | Grad Max: 0.004109
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.043604 | Grad Max: 0.043604
[GRADIENT NORM TOTAL] 18.7068

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54095495 0.45904502] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 676/1372 | C: 677/1371
[LOSS Ex1] A: 0.63014 | B: 0.60391 | C: 0.60701
[LOGITS Ex2 A] Mean Abs: 2.059 | Max: 5.764
[LOSS Ex2] A: 0.10968 | B: 0.33795 | C: 0.23534
** [JOINT LOSS] ** : 0.841344
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009497 | Grad Max: 0.339891
  -> Layer: shared_layers.0.bias | Grad Mean: 0.966417 | Grad Max: 4.681437
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.005532
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011534 | Grad Max: 0.011534
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005968 | Grad Max: 0.870153
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.110782 | Grad Max: 4.911767
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000589 | Grad Max: 0.019101
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.063528 | Grad Max: 0.324871
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000843
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012811 | Grad Max: 0.025283
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000693
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003403 | Grad Max: 0.011291
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001440 | Grad Max: 0.003745
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.044983 | Grad Max: 0.044983
[GRADIENT NORM TOTAL] 20.9693

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318746 0.16681258] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 665/1383 | C: 480/896
[LOSS Ex1] A: 0.62344 | B: 0.60415 | C: 0.60416
[LOGITS Ex2 A] Mean Abs: 2.106 | Max: 6.967
[LOSS Ex2] A: 0.09778 | B: 0.32644 | C: 0.26855
** [JOINT LOSS] ** : 0.841509
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008264 | Grad Max: 0.317328
  -> Layer: shared_layers.0.bias | Grad Mean: 0.859963 | Grad Max: 4.169960
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.005029
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003040 | Grad Max: 0.003040
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005218 | Grad Max: 0.710026
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.097425 | Grad Max: 3.960751
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000524 | Grad Max: 0.015606
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.056694 | Grad Max: 0.298656
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000764
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011452 | Grad Max: 0.021593
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000642
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003046 | Grad Max: 0.010069
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001301 | Grad Max: 0.003430
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040722 | Grad Max: 0.040722
[GRADIENT NORM TOTAL] 18.3270

[EPOCH SUMMARY] Train Loss: 0.8253

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8121 | Alpha: 0.5500
No improve count: 10/15

############################## EPOCH 184/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010793  0.49892068] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 674/1374 | C: 744/1304
[LOSS Ex1] A: 0.63079 | B: 0.59966 | C: 0.60197
[LOGITS Ex2 A] Mean Abs: 2.099 | Max: 6.365
[LOSS Ex2] A: 0.09877 | B: 0.31336 | C: 0.21663
** [JOINT LOSS] ** : 0.820393
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008991 | Grad Max: 0.287714
  -> Layer: shared_layers.0.bias | Grad Mean: 0.792975 | Grad Max: 3.824040
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002021 | Grad Max: 0.004775
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001921 | Grad Max: 0.001921
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004940 | Grad Max: 0.833165
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.091369 | Grad Max: 4.652308
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.015184
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.052123 | Grad Max: 0.273914
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000775
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010683 | Grad Max: 0.020408
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000644
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002869 | Grad Max: 0.009540
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001247 | Grad Max: 0.003620
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038416 | Grad Max: 0.038416
[GRADIENT NORM TOTAL] 17.2122

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75571054 0.24428946] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 626/1230 | C: 763/1285
[LOSS Ex1] A: 0.62568 | B: 0.60360 | C: 0.59029
[LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.722
[LOSS Ex2] A: 0.11481 | B: 0.32394 | C: 0.21180
** [JOINT LOSS] ** : 0.823375
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011050 | Grad Max: 0.298582
  -> Layer: shared_layers.0.bias | Grad Mean: 0.882616 | Grad Max: 3.877453
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.005589
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006417 | Grad Max: 0.006417
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005433 | Grad Max: 0.713624
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.100776 | Grad Max: 4.016646
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000558 | Grad Max: 0.017291
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.059524 | Grad Max: 0.307763
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000872
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.012068 | Grad Max: 0.023972
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000783
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003188 | Grad Max: 0.011156
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003831
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041661 | Grad Max: 0.041661
[GRADIENT NORM TOTAL] 18.3890

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482926  0.35170737] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 676/1372 | C: 745/1303
[LOSS Ex1] A: 0.62377 | B: 0.60391 | C: 0.59754
[LOGITS Ex2 A] Mean Abs: 2.129 | Max: 8.737
[LOSS Ex2] A: 0.10785 | B: 0.34019 | C: 0.22295
** [JOINT LOSS] ** : 0.832070
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.011178 | Grad Max: 0.338813
  -> Layer: shared_layers.0.bias | Grad Mean: 0.964782 | Grad Max: 4.539521
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005937
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011279 | Grad Max: 0.011279
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.006083 | Grad Max: 0.918166
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.112702 | Grad Max: 5.181048
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000615 | Grad Max: 0.018260
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.065721 | Grad Max: 0.324956
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000944
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.013419 | Grad Max: 0.026101
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000782
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003578 | Grad Max: 0.011257
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001561 | Grad Max: 0.003986
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.048001 | Grad Max: 0.048001
[GRADIENT NORM TOTAL] 20.7338

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085096  0.49149042] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 665/1383 | C: 705/1343
[LOSS Ex1] A: 0.62457 | B: 0.60415 | C: 0.59527
[LOGITS Ex2 A] Mean Abs: 2.125 | Max: 7.715
[LOSS Ex2] A: 0.09510 | B: 0.33043 | C: 0.22165
** [JOINT LOSS] ** : 0.823726
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009874 | Grad Max: 0.293107
  -> Layer: shared_layers.0.bias | Grad Mean: 0.870429 | Grad Max: 3.904039
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005148
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005409 | Grad Max: 0.005409
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005377 | Grad Max: 0.739932
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.099758 | Grad Max: 4.137673
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000542 | Grad Max: 0.015974
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.058201 | Grad Max: 0.276935
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000856
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011838 | Grad Max: 0.023370
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000732
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003120 | Grad Max: 0.010771
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001367 | Grad Max: 0.003793
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.041595 | Grad Max: 0.041595
[GRADIENT NORM TOTAL] 18.3682

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037923  0.49620768] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 674/1374 | C: 703/1345
[LOSS Ex1] A: 0.62126 | B: 0.59966 | C: 0.60240
[LOGITS Ex2 A] Mean Abs: 2.116 | Max: 6.938
[LOSS Ex2] A: 0.10311 | B: 0.30863 | C: 0.21883
** [JOINT LOSS] ** : 0.817963
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009300 | Grad Max: 0.291152
  -> Layer: shared_layers.0.bias | Grad Mean: 0.848170 | Grad Max: 3.869596
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.005615
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000503 | Grad Max: 0.000503
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005289 | Grad Max: 0.790862
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.098286 | Grad Max: 4.446589
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.015500
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057450 | Grad Max: 0.283179
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000865
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011701 | Grad Max: 0.022888
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000764
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003075 | Grad Max: 0.010796
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001298 | Grad Max: 0.003892
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039861 | Grad Max: 0.039861
[GRADIENT NORM TOTAL] 18.1098

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122495  0.48775047] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 626/1230 | C: 709/1339
[LOSS Ex1] A: 0.62021 | B: 0.60360 | C: 0.59879
[LOGITS Ex2 A] Mean Abs: 2.079 | Max: 6.779
[LOSS Ex2] A: 0.11144 | B: 0.32241 | C: 0.20846
** [JOINT LOSS] ** : 0.821636
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008192 | Grad Max: 0.285441
  -> Layer: shared_layers.0.bias | Grad Mean: 0.829060 | Grad Max: 3.807547
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006497
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006788 | Grad Max: 0.006788
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005048 | Grad Max: 0.689899
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.094538 | Grad Max: 3.864976
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000515 | Grad Max: 0.016663
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.055458 | Grad Max: 0.291885
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000824
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011077 | Grad Max: 0.022496
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000697
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002896 | Grad Max: 0.009776
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001208 | Grad Max: 0.003376
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037374 | Grad Max: 0.037374
[GRADIENT NORM TOTAL] 17.6026

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040215 0.4995978 ] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 676/1372 | C: 714/1334
[LOSS Ex1] A: 0.62925 | B: 0.60391 | C: 0.59772
[LOGITS Ex2 A] Mean Abs: 2.071 | Max: 5.785
[LOSS Ex2] A: 0.10318 | B: 0.33035 | C: 0.19491
** [JOINT LOSS] ** : 0.819774
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008546 | Grad Max: 0.312621
  -> Layer: shared_layers.0.bias | Grad Mean: 0.871027 | Grad Max: 4.273620
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.005325
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002484 | Grad Max: 0.002484
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005442 | Grad Max: 0.820869
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.101493 | Grad Max: 4.609174
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000533 | Grad Max: 0.016019
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.057570 | Grad Max: 0.288280
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000766
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011612 | Grad Max: 0.022363
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000620
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.003094 | Grad Max: 0.009891
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001299 | Grad Max: 0.003713
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.040608 | Grad Max: 0.040608
[GRADIENT NORM TOTAL] 19.1444

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409538  0.45904616] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 665/1383 | C: 747/1301
[LOSS Ex1] A: 0.63014 | B: 0.60415 | C: 0.59534
[LOGITS Ex2 A] Mean Abs: 2.039 | Max: 6.163
[LOSS Ex2] A: 0.10896 | B: 0.32510 | C: 0.21258
** [JOINT LOSS] ** : 0.825425
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008352 | Grad Max: 0.288658
  -> Layer: shared_layers.0.bias | Grad Mean: 0.829275 | Grad Max: 3.973768
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.005275
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.008060 | Grad Max: 0.008060
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005094 | Grad Max: 0.806759
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095079 | Grad Max: 4.530701
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000508 | Grad Max: 0.015907
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.054591 | Grad Max: 0.276326
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000731
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010918 | Grad Max: 0.021503
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000563
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002869 | Grad Max: 0.009161
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001231 | Grad Max: 0.003461
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037842 | Grad Max: 0.037842
[GRADIENT NORM TOTAL] 18.0823

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331871  0.16681288] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 674/1374 | C: 723/1325
[LOSS Ex1] A: 0.62344 | B: 0.59966 | C: 0.60588
[LOGITS Ex2 A] Mean Abs: 2.086 | Max: 6.437
[LOSS Ex2] A: 0.09875 | B: 0.30820 | C: 0.21895
** [JOINT LOSS] ** : 0.818297
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008530 | Grad Max: 0.274018
  -> Layer: shared_layers.0.bias | Grad Mean: 0.814689 | Grad Max: 3.695608
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005475
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003419 | Grad Max: 0.003419
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005088 | Grad Max: 0.805335
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.094988 | Grad Max: 4.501594
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000511 | Grad Max: 0.015873
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.054908 | Grad Max: 0.278737
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000760
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011074 | Grad Max: 0.021765
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000651
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002915 | Grad Max: 0.009887
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001239 | Grad Max: 0.003661
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037881 | Grad Max: 0.037881
[GRADIENT NORM TOTAL] 17.7029

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50107956 0.49892047] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 626/1230 | C: 729/1319
[LOSS Ex1] A: 0.63079 | B: 0.60360 | C: 0.59939
[LOGITS Ex2 A] Mean Abs: 2.095 | Max: 6.603
[LOSS Ex2] A: 0.09596 | B: 0.31453 | C: 0.21398
** [JOINT LOSS] ** : 0.819419
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008460 | Grad Max: 0.290586
  -> Layer: shared_layers.0.bias | Grad Mean: 0.808912 | Grad Max: 3.758618
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002048 | Grad Max: 0.005221
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001659 | Grad Max: 0.001659
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005052 | Grad Max: 0.753139
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.093306 | Grad Max: 4.218432
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000507 | Grad Max: 0.015307
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.054727 | Grad Max: 0.271966
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000813
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011210 | Grad Max: 0.021969
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000681
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002964 | Grad Max: 0.010438
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001258 | Grad Max: 0.003798
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039194 | Grad Max: 0.039194
[GRADIENT NORM TOTAL] 17.3634

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557099  0.24429008] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 676/1372 | C: 730/1318
[LOSS Ex1] A: 0.62568 | B: 0.60391 | C: 0.59883
[LOGITS Ex2 A] Mean Abs: 2.084 | Max: 6.462
[LOSS Ex2] A: 0.11144 | B: 0.33193 | C: 0.21491
** [JOINT LOSS] ** : 0.828901
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008730 | Grad Max: 0.318571
  -> Layer: shared_layers.0.bias | Grad Mean: 0.851008 | Grad Max: 4.273470
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005786
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002484 | Grad Max: 0.002484
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005274 | Grad Max: 0.865488
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.097706 | Grad Max: 4.879166
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000509 | Grad Max: 0.015251
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.054767 | Grad Max: 0.275155
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000798
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011085 | Grad Max: 0.022131
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000612
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002942 | Grad Max: 0.009357
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.003508
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039440 | Grad Max: 0.039440
[GRADIENT NORM TOTAL] 18.5758

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482913 0.3517087] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 665/1383 | C: 751/1297
[LOSS Ex1] A: 0.62377 | B: 0.60415 | C: 0.59902
[LOGITS Ex2 A] Mean Abs: 2.141 | Max: 10.134
[LOSS Ex2] A: 0.10499 | B: 0.32478 | C: 0.21171
** [JOINT LOSS] ** : 0.822808
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008795 | Grad Max: 0.281615
  -> Layer: shared_layers.0.bias | Grad Mean: 0.828497 | Grad Max: 3.770032
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005310
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007929 | Grad Max: 0.007929
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005142 | Grad Max: 0.853388
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095711 | Grad Max: 4.786537
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000519 | Grad Max: 0.015904
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.055882 | Grad Max: 0.281425
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000776
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011304 | Grad Max: 0.022022
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000635
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002992 | Grad Max: 0.010270
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001277 | Grad Max: 0.003782
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039679 | Grad Max: 0.039679
[GRADIENT NORM TOTAL] 17.9835

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850946 0.49149057] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 674/1374 | C: 696/1352
[LOSS Ex1] A: 0.62457 | B: 0.59966 | C: 0.60203
[LOGITS Ex2 A] Mean Abs: 2.133 | Max: 9.391
[LOSS Ex2] A: 0.10218 | B: 0.30353 | C: 0.19787
** [JOINT LOSS] ** : 0.809949
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007977 | Grad Max: 0.273017
  -> Layer: shared_layers.0.bias | Grad Mean: 0.773398 | Grad Max: 3.735823
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005354
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002590 | Grad Max: 0.002590
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004803 | Grad Max: 0.808509
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088995 | Grad Max: 4.536067
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000476 | Grad Max: 0.014705
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.051467 | Grad Max: 0.267136
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000729
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010481 | Grad Max: 0.021184
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000637
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002751 | Grad Max: 0.009641
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001155 | Grad Max: 0.003579
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035716 | Grad Max: 0.035716
[GRADIENT NORM TOTAL] 16.9233

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037916 0.4962085] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 626/1230 | C: 477/899
[LOSS Ex1] A: 0.62126 | B: 0.60360 | C: 0.60310
[LOGITS Ex2 A] Mean Abs: 2.119 | Max: 6.424
[LOSS Ex2] A: 0.10003 | B: 0.31771 | C: 0.20624
** [JOINT LOSS] ** : 0.817310
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007357 | Grad Max: 0.251871
  -> Layer: shared_layers.0.bias | Grad Mean: 0.733685 | Grad Max: 3.456705
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.006051
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003288 | Grad Max: 0.003288
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004507 | Grad Max: 0.676047
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083655 | Grad Max: 3.797083
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.013856
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047874 | Grad Max: 0.241111
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000711
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009601 | Grad Max: 0.020032
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000628
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002490 | Grad Max: 0.009334
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001061 | Grad Max: 0.003363
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032292 | Grad Max: 0.032292
[GRADIENT NORM TOTAL] 15.7055

[EPOCH SUMMARY] Train Loss: 0.8215

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8088 | Alpha: 0.5500
No improve count: 11/15

############################## EPOCH 185/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122498  0.48775017] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 676/1372 | C: 719/1329
[LOSS Ex1] A: 0.62021 | B: 0.60391 | C: 0.60558
[LOGITS Ex2 A] Mean Abs: 2.094 | Max: 5.878
[LOSS Ex2] A: 0.10780 | B: 0.32386 | C: 0.23254
** [JOINT LOSS] ** : 0.831298
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006808 | Grad Max: 0.312183
  -> Layer: shared_layers.0.bias | Grad Mean: 0.790958 | Grad Max: 4.159384
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005798
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002165 | Grad Max: 0.002165
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004799 | Grad Max: 0.760637
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.089465 | Grad Max: 4.256280
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.014577
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.050402 | Grad Max: 0.255227
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000727
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010122 | Grad Max: 0.019404
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000593
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002715 | Grad Max: 0.008987
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001164 | Grad Max: 0.003221
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036725 | Grad Max: 0.036725
[GRADIENT NORM TOTAL] 17.3347

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004022  0.49959773] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 665/1383 | C: 776/1272
[LOSS Ex1] A: 0.62925 | B: 0.60415 | C: 0.58821
[LOGITS Ex2 A] Mean Abs: 2.052 | Max: 5.639
[LOSS Ex2] A: 0.10262 | B: 0.32421 | C: 0.22311
** [JOINT LOSS] ** : 0.823851
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007745 | Grad Max: 0.282745
  -> Layer: shared_layers.0.bias | Grad Mean: 0.773711 | Grad Max: 3.771061
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.005248
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005704 | Grad Max: 0.005704
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004791 | Grad Max: 0.762482
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.089488 | Grad Max: 4.248768
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.014458
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.051304 | Grad Max: 0.265203
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000694
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010357 | Grad Max: 0.019605
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000611
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002752 | Grad Max: 0.009382
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001194 | Grad Max: 0.003360
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.036924 | Grad Max: 0.036924
[GRADIENT NORM TOTAL] 16.8947

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54095274 0.45904723] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 674/1374 | C: 697/1351
[LOSS Ex1] A: 0.63014 | B: 0.59966 | C: 0.60165
[LOGITS Ex2 A] Mean Abs: 2.042 | Max: 6.701
[LOSS Ex2] A: 0.10991 | B: 0.30974 | C: 0.22223
** [JOINT LOSS] ** : 0.824441
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009390 | Grad Max: 0.287333
  -> Layer: shared_layers.0.bias | Grad Mean: 0.813973 | Grad Max: 3.720760
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.005975
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.012337 | Grad Max: 0.012337
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005069 | Grad Max: 0.793755
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.094080 | Grad Max: 4.473201
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000511 | Grad Max: 0.015251
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.054856 | Grad Max: 0.279427
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000769
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011168 | Grad Max: 0.021944
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000591
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002989 | Grad Max: 0.009686
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001283 | Grad Max: 0.003601
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.039548 | Grad Max: 0.039548
[GRADIENT NORM TOTAL] 17.4058

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318686 0.1668131 ] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 626/1230 | C: 749/1299
[LOSS Ex1] A: 0.62344 | B: 0.60360 | C: 0.59908
[LOGITS Ex2 A] Mean Abs: 2.109 | Max: 7.825
[LOSS Ex2] A: 0.09484 | B: 0.31478 | C: 0.22116
** [JOINT LOSS] ** : 0.818966
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007258 | Grad Max: 0.271672
  -> Layer: shared_layers.0.bias | Grad Mean: 0.764688 | Grad Max: 3.522353
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.005682
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002555 | Grad Max: 0.002556
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004730 | Grad Max: 0.682430
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088011 | Grad Max: 3.790436
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000476 | Grad Max: 0.014875
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.051790 | Grad Max: 0.265771
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000779
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010506 | Grad Max: 0.021811
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000651
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002776 | Grad Max: 0.009372
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001142 | Grad Max: 0.003208
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035833 | Grad Max: 0.035833
[GRADIENT NORM TOTAL] 16.3728

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010797 0.4989203] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 709/1339
[LOSS Ex1] A: 0.63079 | B: 0.60391 | C: 0.59923
[LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.300
[LOSS Ex2] A: 0.09791 | B: 0.33356 | C: 0.21350
** [JOINT LOSS] ** : 0.826302
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007558 | Grad Max: 0.293265
  -> Layer: shared_layers.0.bias | Grad Mean: 0.778919 | Grad Max: 3.903029
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005099
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002803 | Grad Max: 0.002803
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004835 | Grad Max: 0.792147
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.089235 | Grad Max: 4.426651
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000461 | Grad Max: 0.013731
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049858 | Grad Max: 0.251145
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000718
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010166 | Grad Max: 0.020211
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000615
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002685 | Grad Max: 0.009240
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001134 | Grad Max: 0.003295
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035372 | Grad Max: 0.035372
[GRADIENT NORM TOTAL] 17.0767

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557093  0.24429072] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 665/1383 | C: 754/1294
[LOSS Ex1] A: 0.62568 | B: 0.60415 | C: 0.59544
[LOGITS Ex2 A] Mean Abs: 2.071 | Max: 6.867
[LOSS Ex2] A: 0.11905 | B: 0.31919 | C: 0.20009
** [JOINT LOSS] ** : 0.821202
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009219 | Grad Max: 0.258299
  -> Layer: shared_layers.0.bias | Grad Mean: 0.761532 | Grad Max: 3.587745
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005662
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001431 | Grad Max: 0.001431
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004714 | Grad Max: 0.731339
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.087542 | Grad Max: 4.125095
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.013803
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.050821 | Grad Max: 0.247780
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000783
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010308 | Grad Max: 0.020624
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000566
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002718 | Grad Max: 0.009328
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001161 | Grad Max: 0.003528
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035307 | Grad Max: 0.035307
[GRADIENT NORM TOTAL] 16.1018

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482901  0.35170993] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 674/1374 | C: 722/1326
[LOSS Ex1] A: 0.62377 | B: 0.59966 | C: 0.60018
[LOGITS Ex2 A] Mean Abs: 2.124 | Max: 7.735
[LOSS Ex2] A: 0.10622 | B: 0.30795 | C: 0.20637
** [JOINT LOSS] ** : 0.814718
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008261 | Grad Max: 0.245013
  -> Layer: shared_layers.0.bias | Grad Mean: 0.729964 | Grad Max: 3.275234
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005598
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006041 | Grad Max: 0.006041
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004576 | Grad Max: 0.759522
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.084883 | Grad Max: 4.291815
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.014621
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048992 | Grad Max: 0.250512
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000750
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009937 | Grad Max: 0.019728
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000621
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002601 | Grad Max: 0.009423
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001146 | Grad Max: 0.003664
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034387 | Grad Max: 0.034387
[GRADIENT NORM TOTAL] 15.7682

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085093  0.49149072] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 626/1230 | C: 695/1353
[LOSS Ex1] A: 0.62457 | B: 0.60360 | C: 0.60210
[LOGITS Ex2 A] Mean Abs: 2.132 | Max: 7.685
[LOSS Ex2] A: 0.09767 | B: 0.31978 | C: 0.20229
** [JOINT LOSS] ** : 0.816673
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009500 | Grad Max: 0.273171
  -> Layer: shared_layers.0.bias | Grad Mean: 0.821105 | Grad Max: 3.524477
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005921
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000284 | Grad Max: 0.000284
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005049 | Grad Max: 0.723963
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.093820 | Grad Max: 4.061149
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000527 | Grad Max: 0.016224
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.056287 | Grad Max: 0.282230
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000885
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.011419 | Grad Max: 0.022761
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000662
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002992 | Grad Max: 0.010491
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001228 | Grad Max: 0.003546
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038095 | Grad Max: 0.038095
[GRADIENT NORM TOTAL] 17.1257

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037908 0.4962092] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 676/1372 | C: 712/1336
[LOSS Ex1] A: 0.62125 | B: 0.60391 | C: 0.60011
[LOGITS Ex2 A] Mean Abs: 2.121 | Max: 6.767
[LOSS Ex2] A: 0.10073 | B: 0.33249 | C: 0.20704
** [JOINT LOSS] ** : 0.821842
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008712 | Grad Max: 0.297094
  -> Layer: shared_layers.0.bias | Grad Mean: 0.828636 | Grad Max: 4.018713
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006347
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003052 | Grad Max: 0.003052
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.005138 | Grad Max: 0.804664
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.095404 | Grad Max: 4.533433
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000500 | Grad Max: 0.014998
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.053902 | Grad Max: 0.275727
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000711
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010943 | Grad Max: 0.021223
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000589
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002910 | Grad Max: 0.009261
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001253 | Grad Max: 0.003452
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.038724 | Grad Max: 0.038724
[GRADIENT NORM TOTAL] 17.8904

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122501  0.48774984] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 665/1383 | C: 710/1338
[LOSS Ex1] A: 0.62021 | B: 0.60415 | C: 0.60108
[LOGITS Ex2 A] Mean Abs: 2.081 | Max: 6.137
[LOSS Ex2] A: 0.10803 | B: 0.31987 | C: 0.20459
** [JOINT LOSS] ** : 0.819309
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007624 | Grad Max: 0.254914
  -> Layer: shared_layers.0.bias | Grad Mean: 0.724261 | Grad Max: 3.410763
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005813
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002484 | Grad Max: 0.002484
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004415 | Grad Max: 0.752354
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.082261 | Grad Max: 4.208348
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000437 | Grad Max: 0.013687
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047284 | Grad Max: 0.239631
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000657
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009521 | Grad Max: 0.018448
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000629
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002503 | Grad Max: 0.008817
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001095 | Grad Max: 0.003376
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033443 | Grad Max: 0.033443
[GRADIENT NORM TOTAL] 15.6458

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040233 0.49959767] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 674/1374 | C: 721/1327
[LOSS Ex1] A: 0.62925 | B: 0.59966 | C: 0.60220
[LOGITS Ex2 A] Mean Abs: 2.070 | Max: 5.431
[LOSS Ex2] A: 0.10096 | B: 0.30666 | C: 0.20230
** [JOINT LOSS] ** : 0.813681
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007079 | Grad Max: 0.255372
  -> Layer: shared_layers.0.bias | Grad Mean: 0.722825 | Grad Max: 3.331891
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.004967
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001774 | Grad Max: 0.001774
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004530 | Grad Max: 0.795149
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.084543 | Grad Max: 4.447441
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.013101
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049379 | Grad Max: 0.244325
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000737
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009887 | Grad Max: 0.020467
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000615
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002574 | Grad Max: 0.009084
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001072 | Grad Max: 0.003382
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032941 | Grad Max: 0.032941
[GRADIENT NORM TOTAL] 15.7244

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409516  0.45904836] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 626/1230 | C: 732/1316
[LOSS Ex1] A: 0.63014 | B: 0.60360 | C: 0.60212
[LOGITS Ex2 A] Mean Abs: 2.058 | Max: 5.998
[LOSS Ex2] A: 0.10184 | B: 0.31149 | C: 0.20222
** [JOINT LOSS] ** : 0.817138
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007920 | Grad Max: 0.220602
  -> Layer: shared_layers.0.bias | Grad Mean: 0.660516 | Grad Max: 3.020899
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.005030
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003595 | Grad Max: 0.003595
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004167 | Grad Max: 0.640904
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.077184 | Grad Max: 3.613054
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000422 | Grad Max: 0.014169
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044975 | Grad Max: 0.237181
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000642
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009136 | Grad Max: 0.018177
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000515
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002396 | Grad Max: 0.008140
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001042 | Grad Max: 0.003304
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031544 | Grad Max: 0.031544
[GRADIENT NORM TOTAL] 14.2374

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331866  0.16681337] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 676/1372 | C: 731/1317
[LOSS Ex1] A: 0.62344 | B: 0.60391 | C: 0.59132
[LOGITS Ex2 A] Mean Abs: 2.103 | Max: 6.394
[LOSS Ex2] A: 0.09649 | B: 0.32173 | C: 0.20583
** [JOINT LOSS] ** : 0.814239
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005888 | Grad Max: 0.281947
  -> Layer: shared_layers.0.bias | Grad Mean: 0.733430 | Grad Max: 3.798266
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005572
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001524 | Grad Max: 0.001524
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004483 | Grad Max: 0.736988
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083594 | Grad Max: 4.103831
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000437 | Grad Max: 0.014272
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047682 | Grad Max: 0.247768
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000695
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009581 | Grad Max: 0.019274
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000563
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002537 | Grad Max: 0.008616
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001062 | Grad Max: 0.003367
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033390 | Grad Max: 0.033390
[GRADIENT NORM TOTAL] 16.0634

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50107986 0.49892014] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 665/1383 | C: 509/867
[LOSS Ex1] A: 0.63079 | B: 0.60415 | C: 0.59742
[LOGITS Ex2 A] Mean Abs: 2.128 | Max: 5.641
[LOSS Ex2] A: 0.09308 | B: 0.31781 | C: 0.20826
** [JOINT LOSS] ** : 0.817171
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006804 | Grad Max: 0.266654
  -> Layer: shared_layers.0.bias | Grad Mean: 0.716886 | Grad Max: 3.498542
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.005027
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004928 | Grad Max: 0.004928
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004391 | Grad Max: 0.758154
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.081585 | Grad Max: 4.219219
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000437 | Grad Max: 0.013272
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047455 | Grad Max: 0.245340
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000667
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009586 | Grad Max: 0.018624
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000577
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002543 | Grad Max: 0.008677
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001075 | Grad Max: 0.003351
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033698 | Grad Max: 0.033698
[GRADIENT NORM TOTAL] 15.4922

[EPOCH SUMMARY] Train Loss: 0.8201

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8072 | Alpha: 0.5500
No improve count: 12/15

############################## EPOCH 186/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75570863 0.24429138] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 674/1374 | C: 752/1296
[LOSS Ex1] A: 0.62568 | B: 0.59966 | C: 0.59366
[LOGITS Ex2 A] Mean Abs: 2.089 | Max: 6.209
[LOSS Ex2] A: 0.11229 | B: 0.30664 | C: 0.20085
** [JOINT LOSS] ** : 0.812928
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.009215 | Grad Max: 0.249093
  -> Layer: shared_layers.0.bias | Grad Mean: 0.736755 | Grad Max: 3.421594
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005903
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009135 | Grad Max: 0.009135
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004546 | Grad Max: 0.743008
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083871 | Grad Max: 4.173026
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000456 | Grad Max: 0.014493
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.048610 | Grad Max: 0.237567
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000693
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009888 | Grad Max: 0.019974
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000583
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002570 | Grad Max: 0.009345
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001092 | Grad Max: 0.003542
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033085 | Grad Max: 0.033085
[GRADIENT NORM TOTAL] 15.5858

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64828885 0.35171115] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 626/1230 | C: 721/1327
[LOSS Ex1] A: 0.62377 | B: 0.60360 | C: 0.60265
[LOGITS Ex2 A] Mean Abs: 2.147 | Max: 8.310
[LOSS Ex2] A: 0.10239 | B: 0.30688 | C: 0.22714
** [JOINT LOSS] ** : 0.822143
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008234 | Grad Max: 0.248137
  -> Layer: shared_layers.0.bias | Grad Mean: 0.731031 | Grad Max: 3.140438
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.006046
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000535 | Grad Max: 0.000535
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004490 | Grad Max: 0.679851
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.083414 | Grad Max: 3.803739
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.014205
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.050577 | Grad Max: 0.248081
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000770
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010260 | Grad Max: 0.019918
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000609
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002716 | Grad Max: 0.009604
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001159 | Grad Max: 0.003477
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035926 | Grad Max: 0.035926
[GRADIENT NORM TOTAL] 15.3138

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850916 0.49149087] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 676/1372 | C: 715/1333
[LOSS Ex1] A: 0.62457 | B: 0.60391 | C: 0.60173
[LOGITS Ex2 A] Mean Abs: 2.125 | Max: 7.692
[LOSS Ex2] A: 0.09388 | B: 0.32933 | C: 0.21434
** [JOINT LOSS] ** : 0.822588
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008552 | Grad Max: 0.287059
  -> Layer: shared_layers.0.bias | Grad Mean: 0.777928 | Grad Max: 3.786120
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005683
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000021 | Grad Max: 0.000021
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004813 | Grad Max: 0.833840
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088839 | Grad Max: 4.678837
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000473 | Grad Max: 0.013296
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.050716 | Grad Max: 0.243041
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000706
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010369 | Grad Max: 0.019796
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000578
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002796 | Grad Max: 0.008769
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001206 | Grad Max: 0.003428
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.037824 | Grad Max: 0.037824
[GRADIENT NORM TOTAL] 16.9204

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50379    0.49620998] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 665/1383 | C: 712/1336
[LOSS Ex1] A: 0.62125 | B: 0.60415 | C: 0.59836
[LOGITS Ex2 A] Mean Abs: 2.113 | Max: 5.921
[LOSS Ex2] A: 0.10175 | B: 0.32991 | C: 0.21115
** [JOINT LOSS] ** : 0.822193
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007644 | Grad Max: 0.236983
  -> Layer: shared_layers.0.bias | Grad Mean: 0.701715 | Grad Max: 3.239472
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006021
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002351 | Grad Max: 0.002351
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004315 | Grad Max: 0.672996
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.080252 | Grad Max: 3.783895
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000430 | Grad Max: 0.012625
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.046359 | Grad Max: 0.224851
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000689
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009326 | Grad Max: 0.018756
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000549
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002442 | Grad Max: 0.008557
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001057 | Grad Max: 0.003444
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032255 | Grad Max: 0.032255
[GRADIENT NORM TOTAL] 14.9737

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122504  0.48774958] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 674/1374 | C: 744/1304
[LOSS Ex1] A: 0.62021 | B: 0.59966 | C: 0.59622
[LOGITS Ex2 A] Mean Abs: 2.096 | Max: 7.004
[LOSS Ex2] A: 0.11645 | B: 0.29883 | C: 0.20720
** [JOINT LOSS] ** : 0.812855
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007275 | Grad Max: 0.243145
  -> Layer: shared_layers.0.bias | Grad Mean: 0.680541 | Grad Max: 3.195089
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.005715
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002225 | Grad Max: 0.002225
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004151 | Grad Max: 0.690252
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.077438 | Grad Max: 3.873073
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.012629
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044472 | Grad Max: 0.227292
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000655
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008908 | Grad Max: 0.018502
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000550
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002356 | Grad Max: 0.008177
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001037 | Grad Max: 0.003300
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031408 | Grad Max: 0.031408
[GRADIENT NORM TOTAL] 14.4741

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004024 0.4995976] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 626/1230 | C: 745/1303
[LOSS Ex1] A: 0.62925 | B: 0.60360 | C: 0.59481
[LOGITS Ex2 A] Mean Abs: 2.080 | Max: 5.138
[LOSS Ex2] A: 0.10065 | B: 0.30494 | C: 0.22550
** [JOINT LOSS] ** : 0.819583
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007313 | Grad Max: 0.254235
  -> Layer: shared_layers.0.bias | Grad Mean: 0.704891 | Grad Max: 3.300234
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005218
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004288 | Grad Max: 0.004288
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004445 | Grad Max: 0.721949
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.082187 | Grad Max: 4.053629
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000441 | Grad Max: 0.013652
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047449 | Grad Max: 0.235260
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000722
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009621 | Grad Max: 0.018996
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000566
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002511 | Grad Max: 0.008592
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001064 | Grad Max: 0.003227
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.033060 | Grad Max: 0.033060
[GRADIENT NORM TOTAL] 15.3040

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409505 0.4590495] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 676/1372 | C: 762/1286
[LOSS Ex1] A: 0.63014 | B: 0.60391 | C: 0.59910
[LOGITS Ex2 A] Mean Abs: 2.051 | Max: 5.663
[LOSS Ex2] A: 0.10653 | B: 0.32336 | C: 0.20726
** [JOINT LOSS] ** : 0.823429
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.008152 | Grad Max: 0.271708
  -> Layer: shared_layers.0.bias | Grad Mean: 0.754435 | Grad Max: 3.694061
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.005473
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006073 | Grad Max: 0.006073
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004744 | Grad Max: 0.784414
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.088367 | Grad Max: 4.416069
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000468 | Grad Max: 0.013953
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.050398 | Grad Max: 0.257832
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000710
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010179 | Grad Max: 0.020912
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000567
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002677 | Grad Max: 0.009195
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001135 | Grad Max: 0.003615
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034872 | Grad Max: 0.034872
[GRADIENT NORM TOTAL] 16.4829

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318645 0.16681357] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 665/1383 | C: 736/1312
[LOSS Ex1] A: 0.62344 | B: 0.60415 | C: 0.59299
[LOGITS Ex2 A] Mean Abs: 2.116 | Max: 7.133
[LOSS Ex2] A: 0.09796 | B: 0.31623 | C: 0.19974
** [JOINT LOSS] ** : 0.811505
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005782 | Grad Max: 0.249808
  -> Layer: shared_layers.0.bias | Grad Mean: 0.684816 | Grad Max: 3.311555
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005693
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004251 | Grad Max: 0.004251
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004136 | Grad Max: 0.671266
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.077172 | Grad Max: 3.748067
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.014116
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.045202 | Grad Max: 0.237946
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000720
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009140 | Grad Max: 0.020477
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000516
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002396 | Grad Max: 0.008465
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001001 | Grad Max: 0.003340
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031414 | Grad Max: 0.031414
[GRADIENT NORM TOTAL] 14.6074

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50108004 0.49891996] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 674/1374 | C: 713/1335
[LOSS Ex1] A: 0.63079 | B: 0.59966 | C: 0.60383
[LOGITS Ex2 A] Mean Abs: 2.120 | Max: 6.693
[LOSS Ex2] A: 0.09231 | B: 0.29991 | C: 0.21015
** [JOINT LOSS] ** : 0.812217
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006832 | Grad Max: 0.204160
  -> Layer: shared_layers.0.bias | Grad Mean: 0.580316 | Grad Max: 2.756455
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005001
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004024 | Grad Max: 0.004024
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003657 | Grad Max: 0.593210
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066935 | Grad Max: 3.295705
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.010587
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037935 | Grad Max: 0.180622
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000615
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007784 | Grad Max: 0.016342
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000474
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002025 | Grad Max: 0.007103
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000875 | Grad Max: 0.003072
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026428 | Grad Max: 0.026428
[GRADIENT NORM TOTAL] 12.5192

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557081  0.24429193] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 626/1230 | C: 709/1339
[LOSS Ex1] A: 0.62568 | B: 0.60360 | C: 0.60208
[LOGITS Ex2 A] Mean Abs: 2.089 | Max: 7.077
[LOSS Ex2] A: 0.11557 | B: 0.30712 | C: 0.19348
** [JOINT LOSS] ** : 0.815846
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007316 | Grad Max: 0.222380
  -> Layer: shared_layers.0.bias | Grad Mean: 0.651993 | Grad Max: 2.974643
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005507
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003965 | Grad Max: 0.003965
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004042 | Grad Max: 0.604047
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074818 | Grad Max: 3.407891
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.012224
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.043725 | Grad Max: 0.218726
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000664
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008825 | Grad Max: 0.017529
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000520
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002295 | Grad Max: 0.007880
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000958 | Grad Max: 0.003335
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029165 | Grad Max: 0.029165
[GRADIENT NORM TOTAL] 13.8109

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64828753 0.35171247] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 676/1372 | C: 706/1342
[LOSS Ex1] A: 0.62377 | B: 0.60391 | C: 0.60284
[LOGITS Ex2 A] Mean Abs: 2.145 | Max: 8.353
[LOSS Ex2] A: 0.09809 | B: 0.32066 | C: 0.20931
** [JOINT LOSS] ** : 0.819525
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007240 | Grad Max: 0.266877
  -> Layer: shared_layers.0.bias | Grad Mean: 0.743254 | Grad Max: 3.640386
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.006329
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.014988 | Grad Max: 0.014988
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004628 | Grad Max: 0.775777
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.085550 | Grad Max: 4.353660
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000452 | Grad Max: 0.013784
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049233 | Grad Max: 0.253784
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000726
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010011 | Grad Max: 0.020852
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000621
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002666 | Grad Max: 0.009049
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001125 | Grad Max: 0.003446
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035004 | Grad Max: 0.035004
[GRADIENT NORM TOTAL] 16.4500

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.508509 0.491491] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 665/1383 | C: 724/1324
[LOSS Ex1] A: 0.62457 | B: 0.60415 | C: 0.59969
[LOGITS Ex2 A] Mean Abs: 2.139 | Max: 7.236
[LOSS Ex2] A: 0.09652 | B: 0.32446 | C: 0.20367
** [JOINT LOSS] ** : 0.817688
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007322 | Grad Max: 0.237742
  -> Layer: shared_layers.0.bias | Grad Mean: 0.663928 | Grad Max: 3.201676
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.005558
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.003695 | Grad Max: 0.003695
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004076 | Grad Max: 0.715225
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.075610 | Grad Max: 4.019011
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.012044
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.043803 | Grad Max: 0.220571
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000644
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008913 | Grad Max: 0.017674
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000527
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002328 | Grad Max: 0.007940
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000985 | Grad Max: 0.002994
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.030500 | Grad Max: 0.030500
[GRADIENT NORM TOTAL] 14.2362

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50378925 0.49621078] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 674/1374 | C: 709/1339
[LOSS Ex1] A: 0.62125 | B: 0.59966 | C: 0.60115
[LOGITS Ex2 A] Mean Abs: 2.120 | Max: 6.887
[LOSS Ex2] A: 0.10576 | B: 0.29593 | C: 0.20663
** [JOINT LOSS] ** : 0.810133
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005275 | Grad Max: 0.213051
  -> Layer: shared_layers.0.bias | Grad Mean: 0.600426 | Grad Max: 2.848289
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.005792
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001222 | Grad Max: 0.001222
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003690 | Grad Max: 0.653376
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.068768 | Grad Max: 3.662163
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000365 | Grad Max: 0.011721
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.039951 | Grad Max: 0.217400
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000575
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007999 | Grad Max: 0.016165
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000440
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002082 | Grad Max: 0.006442
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000878 | Grad Max: 0.002832
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027127 | Grad Max: 0.027127
[GRADIENT NORM TOTAL] 13.0583

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122507  0.48774925] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 626/1230 | C: 488/888
[LOSS Ex1] A: 0.62021 | B: 0.60360 | C: 0.59620
[LOGITS Ex2 A] Mean Abs: 2.083 | Max: 6.342
[LOSS Ex2] A: 0.10303 | B: 0.30197 | C: 0.23641
** [JOINT LOSS] ** : 0.820472
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005335 | Grad Max: 0.246352
  -> Layer: shared_layers.0.bias | Grad Mean: 0.660678 | Grad Max: 3.122468
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.005989
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002978 | Grad Max: 0.002978
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003984 | Grad Max: 0.639311
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.074330 | Grad Max: 3.573451
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000406 | Grad Max: 0.013269
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044282 | Grad Max: 0.211338
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000649
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008800 | Grad Max: 0.017618
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000513
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002339 | Grad Max: 0.007898
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000994 | Grad Max: 0.003097
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031421 | Grad Max: 0.031421
[GRADIENT NORM TOTAL] 14.1321

[EPOCH SUMMARY] Train Loss: 0.8174

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8045 | Alpha: 0.5500
No improve count: 13/15

############################## EPOCH 187/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040245 0.49959752] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 676/1372 | C: 754/1294
[LOSS Ex1] A: 0.62925 | B: 0.60391 | C: 0.59522
[LOGITS Ex2 A] Mean Abs: 2.086 | Max: 5.697
[LOSS Ex2] A: 0.09964 | B: 0.32939 | C: 0.22413
** [JOINT LOSS] ** : 0.827183
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007300 | Grad Max: 0.264881
  -> Layer: shared_layers.0.bias | Grad Mean: 0.723565 | Grad Max: 3.551191
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005518
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.006683 | Grad Max: 0.006683
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004551 | Grad Max: 0.751902
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.084384 | Grad Max: 4.219884
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.014493
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.049474 | Grad Max: 0.251089
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000739
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.010074 | Grad Max: 0.020373
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000569
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002656 | Grad Max: 0.008772
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001142 | Grad Max: 0.003258
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.035329 | Grad Max: 0.035329
[GRADIENT NORM TOTAL] 15.7346

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409494  0.45905066] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 665/1383 | C: 739/1309
[LOSS Ex1] A: 0.63013 | B: 0.60415 | C: 0.60167
[LOGITS Ex2 A] Mean Abs: 2.068 | Max: 5.705
[LOSS Ex2] A: 0.11074 | B: 0.32124 | C: 0.21821
** [JOINT LOSS] ** : 0.828712
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007747 | Grad Max: 0.241281
  -> Layer: shared_layers.0.bias | Grad Mean: 0.686280 | Grad Max: 3.121789
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.005350
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010715 | Grad Max: 0.010715
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004277 | Grad Max: 0.734749
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.079714 | Grad Max: 4.119785
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000439 | Grad Max: 0.012909
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.047084 | Grad Max: 0.234606
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000697
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.009546 | Grad Max: 0.018449
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000562
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002546 | Grad Max: 0.008901
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001095 | Grad Max: 0.003368
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.034013 | Grad Max: 0.034013
[GRADIENT NORM TOTAL] 14.7542

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331862  0.16681375] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 674/1374 | C: 703/1345
[LOSS Ex1] A: 0.62344 | B: 0.59966 | C: 0.59788
[LOGITS Ex2 A] Mean Abs: 2.114 | Max: 7.152
[LOSS Ex2] A: 0.09897 | B: 0.29826 | C: 0.20539
** [JOINT LOSS] ** : 0.807868
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005243 | Grad Max: 0.190491
  -> Layer: shared_layers.0.bias | Grad Mean: 0.523630 | Grad Max: 2.552562
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.005483
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005098 | Grad Max: 0.005098
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003278 | Grad Max: 0.401644
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060591 | Grad Max: 2.244462
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000331 | Grad Max: 0.010133
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035415 | Grad Max: 0.176780
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000581
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007214 | Grad Max: 0.015019
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000496
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001877 | Grad Max: 0.006921
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000807 | Grad Max: 0.002931
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024755 | Grad Max: 0.024755
[GRADIENT NORM TOTAL] 11.1647

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010802 0.4989198] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 626/1230 | C: 730/1318
[LOSS Ex1] A: 0.63079 | B: 0.60360 | C: 0.59409
[LOGITS Ex2 A] Mean Abs: 2.123 | Max: 6.507
[LOSS Ex2] A: 0.08839 | B: 0.30794 | C: 0.21152
** [JOINT LOSS] ** : 0.812109
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007000 | Grad Max: 0.205359
  -> Layer: shared_layers.0.bias | Grad Mean: 0.602296 | Grad Max: 2.669979
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.005491
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000567 | Grad Max: 0.000567
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003788 | Grad Max: 0.423704
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070069 | Grad Max: 2.379806
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000399 | Grad Max: 0.012963
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.042723 | Grad Max: 0.222214
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000667
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008697 | Grad Max: 0.017987
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000600
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002273 | Grad Max: 0.008517
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000940 | Grad Max: 0.003195
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029250 | Grad Max: 0.029250
[GRADIENT NORM TOTAL] 12.4675

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.75570756 0.24429241] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 676/1372 | C: 739/1309
[LOSS Ex1] A: 0.62568 | B: 0.60391 | C: 0.60101
[LOGITS Ex2 A] Mean Abs: 2.095 | Max: 7.248
[LOSS Ex2] A: 0.10985 | B: 0.32379 | C: 0.20577
** [JOINT LOSS] ** : 0.823335
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006824 | Grad Max: 0.220128
  -> Layer: shared_layers.0.bias | Grad Mean: 0.609092 | Grad Max: 2.872821
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.005753
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000791 | Grad Max: 0.000791
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003818 | Grad Max: 0.497031
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.070667 | Grad Max: 2.814669
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000379 | Grad Max: 0.012022
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040772 | Grad Max: 0.222001
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000604
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008220 | Grad Max: 0.016178
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000489
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002176 | Grad Max: 0.007657
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000926 | Grad Max: 0.003001
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028641 | Grad Max: 0.028641
[GRADIENT NORM TOTAL] 13.0604

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.64828634 0.35171366] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 665/1383 | C: 691/1357
[LOSS Ex1] A: 0.62377 | B: 0.60415 | C: 0.60331
[LOGITS Ex2 A] Mean Abs: 2.138 | Max: 10.286
[LOSS Ex2] A: 0.10287 | B: 0.32022 | C: 0.18909
** [JOINT LOSS] ** : 0.814472
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006498 | Grad Max: 0.185110
  -> Layer: shared_layers.0.bias | Grad Mean: 0.569787 | Grad Max: 2.529488
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005526
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.001413 | Grad Max: 0.001413
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003526 | Grad Max: 0.504322
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065560 | Grad Max: 2.794327
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.011176
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.038309 | Grad Max: 0.206929
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000651
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007665 | Grad Max: 0.016801
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000474
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001974 | Grad Max: 0.007368
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000855 | Grad Max: 0.003187
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025451 | Grad Max: 0.025451
[GRADIENT NORM TOTAL] 12.0643

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850886 0.49149117] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 674/1374 | C: 753/1295
[LOSS Ex1] A: 0.62457 | B: 0.59966 | C: 0.59584
[LOGITS Ex2 A] Mean Abs: 2.118 | Max: 8.322
[LOSS Ex2] A: 0.09255 | B: 0.29619 | C: 0.20587
** [JOINT LOSS] ** : 0.804894
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007387 | Grad Max: 0.193063
  -> Layer: shared_layers.0.bias | Grad Mean: 0.597565 | Grad Max: 2.635975
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002203 | Grad Max: 0.005884
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004431 | Grad Max: 0.004431
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003735 | Grad Max: 0.501850
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.069052 | Grad Max: 2.781194
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.012679
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.041077 | Grad Max: 0.206180
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000667
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008379 | Grad Max: 0.017251
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000502
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002166 | Grad Max: 0.007908
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000952 | Grad Max: 0.003337
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028343 | Grad Max: 0.028343
[GRADIENT NORM TOTAL] 12.4876

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037884  0.49621156] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 626/1230 | C: 711/1337
[LOSS Ex1] A: 0.62125 | B: 0.60360 | C: 0.60307
[LOGITS Ex2 A] Mean Abs: 2.123 | Max: 7.352
[LOSS Ex2] A: 0.10192 | B: 0.30456 | C: 0.21149
** [JOINT LOSS] ** : 0.815301
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005266 | Grad Max: 0.199132
  -> Layer: shared_layers.0.bias | Grad Mean: 0.580873 | Grad Max: 2.610063
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006165
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004793 | Grad Max: 0.004793
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003638 | Grad Max: 0.483207
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.067513 | Grad Max: 2.680828
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.012403
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040537 | Grad Max: 0.208722
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000599
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008162 | Grad Max: 0.016822
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000506
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002141 | Grad Max: 0.007557
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000927 | Grad Max: 0.003054
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028249 | Grad Max: 0.028249
[GRADIENT NORM TOTAL] 12.2954

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122511  0.48774892] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 676/1372 | C: 707/1341
[LOSS Ex1] A: 0.62021 | B: 0.60391 | C: 0.60799
[LOGITS Ex2 A] Mean Abs: 2.093 | Max: 7.241
[LOSS Ex2] A: 0.10672 | B: 0.31592 | C: 0.22788
** [JOINT LOSS] ** : 0.827541
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007031 | Grad Max: 0.238829
  -> Layer: shared_layers.0.bias | Grad Mean: 0.655340 | Grad Max: 2.972092
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.005457
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005935 | Grad Max: 0.005935
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.004018 | Grad Max: 0.550272
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.075057 | Grad Max: 3.025749
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.014744
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.044319 | Grad Max: 0.253060
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000687
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008868 | Grad Max: 0.018604
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000549
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002377 | Grad Max: 0.007745
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001046 | Grad Max: 0.003214
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.032096 | Grad Max: 0.032096
[GRADIENT NORM TOTAL] 13.6991

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040257 0.49959743] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 665/1383 | C: 708/1340
[LOSS Ex1] A: 0.62925 | B: 0.60415 | C: 0.60111
[LOGITS Ex2 A] Mean Abs: 2.088 | Max: 6.026
[LOSS Ex2] A: 0.10074 | B: 0.30989 | C: 0.21404
** [JOINT LOSS] ** : 0.819728
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006873 | Grad Max: 0.221591
  -> Layer: shared_layers.0.bias | Grad Mean: 0.615363 | Grad Max: 2.788455
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.006019
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.010675 | Grad Max: 0.010675
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003868 | Grad Max: 0.507046
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.071831 | Grad Max: 2.787810
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.011243
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.043227 | Grad Max: 0.200154
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000643
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008802 | Grad Max: 0.017510
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000494
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002341 | Grad Max: 0.007526
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.001003 | Grad Max: 0.003234
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.031095 | Grad Max: 0.031095
[GRADIENT NORM TOTAL] 12.9683

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409482  0.45905182] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 674/1374 | C: 735/1313
[LOSS Ex1] A: 0.63013 | B: 0.59966 | C: 0.60384
[LOGITS Ex2 A] Mean Abs: 2.071 | Max: 5.945
[LOSS Ex2] A: 0.10455 | B: 0.29666 | C: 0.20657
** [JOINT LOSS] ** : 0.813806
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006130 | Grad Max: 0.166371
  -> Layer: shared_layers.0.bias | Grad Mean: 0.537527 | Grad Max: 2.254179
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002014 | Grad Max: 0.005388
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004914 | Grad Max: 0.004914
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003316 | Grad Max: 0.417892
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.061524 | Grad Max: 2.326216
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000344 | Grad Max: 0.010598
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037003 | Grad Max: 0.190425
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000589
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007431 | Grad Max: 0.015419
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000413
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001921 | Grad Max: 0.006258
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000823 | Grad Max: 0.002884
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024751 | Grad Max: 0.024751
[GRADIENT NORM TOTAL] 11.1796

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318603 0.16681394] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 626/1230 | C: 718/1330
[LOSS Ex1] A: 0.62344 | B: 0.60360 | C: 0.59464
[LOGITS Ex2 A] Mean Abs: 2.114 | Max: 7.713
[LOSS Ex2] A: 0.09695 | B: 0.29642 | C: 0.21170
** [JOINT LOSS] ** : 0.808918
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004194 | Grad Max: 0.184553
  -> Layer: shared_layers.0.bias | Grad Mean: 0.528907 | Grad Max: 2.452946
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.005982
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007160 | Grad Max: 0.007160
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003239 | Grad Max: 0.432457
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060326 | Grad Max: 2.373373
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.011167
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036330 | Grad Max: 0.192008
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000546
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007179 | Grad Max: 0.015017
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000453
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001842 | Grad Max: 0.007045
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000762 | Grad Max: 0.002906
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023441 | Grad Max: 0.023441
[GRADIENT NORM TOTAL] 11.1788

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50108045 0.49891958] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 727/1321
[LOSS Ex1] A: 0.63079 | B: 0.60391 | C: 0.59329
[LOGITS Ex2 A] Mean Abs: 2.108 | Max: 6.596
[LOSS Ex2] A: 0.09495 | B: 0.32000 | C: 0.21359
** [JOINT LOSS] ** : 0.818838
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005528 | Grad Max: 0.220326
  -> Layer: shared_layers.0.bias | Grad Mean: 0.548176 | Grad Max: 2.900694
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005136
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000025 | Grad Max: 0.000025
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003452 | Grad Max: 0.526271
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.063315 | Grad Max: 2.898128
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000326 | Grad Max: 0.011181
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035536 | Grad Max: 0.193536
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000566
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007313 | Grad Max: 0.015642
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000446
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001948 | Grad Max: 0.007061
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000809 | Grad Max: 0.002902
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025550 | Grad Max: 0.025550
[GRADIENT NORM TOTAL] 12.0014

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.755707   0.24429302] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 665/1383 | C: 521/855
[LOSS Ex1] A: 0.62568 | B: 0.60415 | C: 0.59047
[LOGITS Ex2 A] Mean Abs: 2.089 | Max: 6.754
[LOSS Ex2] A: 0.11416 | B: 0.30940 | C: 0.20412
** [JOINT LOSS] ** : 0.815996
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007558 | Grad Max: 0.201804
  -> Layer: shared_layers.0.bias | Grad Mean: 0.590470 | Grad Max: 2.553873
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.005789
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.009872 | Grad Max: 0.009872
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003695 | Grad Max: 0.424773
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.068390 | Grad Max: 2.312513
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000393 | Grad Max: 0.013079
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.041873 | Grad Max: 0.227383
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000633
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008448 | Grad Max: 0.017121
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000506
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002226 | Grad Max: 0.007350
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000987 | Grad Max: 0.003176
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.029757 | Grad Max: 0.029757
[GRADIENT NORM TOTAL] 12.0736

[EPOCH SUMMARY] Train Loss: 0.8171

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8030 | Alpha: 0.5500
No improve count: 14/15

############################## EPOCH 188/500 START ##############################

>>> [TRAIN] BATCH 0 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482851  0.35171497] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 674/1374 | C: 717/1331
[LOSS Ex1] A: 0.62377 | B: 0.59966 | C: 0.59829
[LOGITS Ex2 A] Mean Abs: 2.147 | Max: 8.416
[LOSS Ex2] A: 0.09621 | B: 0.29634 | C: 0.20370
** [JOINT LOSS] ** : 0.805992
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.007065 | Grad Max: 0.189758
  -> Layer: shared_layers.0.bias | Grad Mean: 0.565639 | Grad Max: 2.543622
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005998
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.011180 | Grad Max: 0.011180
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003577 | Grad Max: 0.471538
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.066321 | Grad Max: 2.610998
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000378 | Grad Max: 0.011868
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.040288 | Grad Max: 0.203090
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000603
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.008188 | Grad Max: 0.016632
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000454
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002144 | Grad Max: 0.007260
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000944 | Grad Max: 0.003294
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.028444 | Grad Max: 0.028444
[GRADIENT NORM TOTAL] 11.9788

>>> [TRAIN] BATCH 1 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085088  0.49149123] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 626/1230 | C: 733/1315
[LOSS Ex1] A: 0.62457 | B: 0.60360 | C: 0.60227
[LOGITS Ex2 A] Mean Abs: 2.148 | Max: 8.874
[LOSS Ex2] A: 0.08544 | B: 0.30428 | C: 0.22550
** [JOINT LOSS] ** : 0.815224
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006319 | Grad Max: 0.181624
  -> Layer: shared_layers.0.bias | Grad Mean: 0.544914 | Grad Max: 2.342465
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005563
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002376 | Grad Max: 0.002376
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003374 | Grad Max: 0.382795
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.062499 | Grad Max: 2.120625
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.010717
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.037690 | Grad Max: 0.194696
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000625
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007637 | Grad Max: 0.016299
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000437
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002002 | Grad Max: 0.006239
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000871 | Grad Max: 0.002774
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.026312 | Grad Max: 0.026312
[GRADIENT NORM TOTAL] 11.2040

>>> [TRAIN] BATCH 2 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037876  0.49621245] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 676/1372 | C: 717/1331
[LOSS Ex1] A: 0.62125 | B: 0.60390 | C: 0.59999
[LOGITS Ex2 A] Mean Abs: 2.127 | Max: 7.528
[LOSS Ex2] A: 0.10465 | B: 0.32090 | C: 0.18697
** [JOINT LOSS] ** : 0.812557
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004944 | Grad Max: 0.206374
  -> Layer: shared_layers.0.bias | Grad Mean: 0.564062 | Grad Max: 2.790831
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006463
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007453 | Grad Max: 0.007453
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003466 | Grad Max: 0.508147
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.064277 | Grad Max: 2.808617
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.010766
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.036592 | Grad Max: 0.185497
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000575
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007327 | Grad Max: 0.016097
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000441
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001880 | Grad Max: 0.006676
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000770 | Grad Max: 0.002800
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023972 | Grad Max: 0.023972
[GRADIENT NORM TOTAL] 12.2744

>>> [TRAIN] BATCH 3 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51225144 0.48774853] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 665/1383 | C: 743/1305
[LOSS Ex1] A: 0.62021 | B: 0.60415 | C: 0.60071
[LOGITS Ex2 A] Mean Abs: 2.100 | Max: 6.858
[LOSS Ex2] A: 0.10946 | B: 0.31718 | C: 0.22692
** [JOINT LOSS] ** : 0.826208
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.003831 | Grad Max: 0.207102
  -> Layer: shared_layers.0.bias | Grad Mean: 0.523785 | Grad Max: 2.536822
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.005386
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.004883 | Grad Max: 0.004883
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003145 | Grad Max: 0.386604
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058194 | Grad Max: 2.134387
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000325 | Grad Max: 0.011181
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035342 | Grad Max: 0.186598
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000484
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006967 | Grad Max: 0.013709
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000407
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001868 | Grad Max: 0.006336
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000835 | Grad Max: 0.002625
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.025905 | Grad Max: 0.025905
[GRADIENT NORM TOTAL] 10.9855

>>> [TRAIN] BATCH 4 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004027  0.49959725] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 674/1374 | C: 732/1316
[LOSS Ex1] A: 0.62925 | B: 0.59966 | C: 0.59559
[LOGITS Ex2 A] Mean Abs: 2.062 | Max: 5.513
[LOSS Ex2] A: 0.10482 | B: 0.29870 | C: 0.20505
** [JOINT LOSS] ** : 0.811026
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005890 | Grad Max: 0.164258
  -> Layer: shared_layers.0.bias | Grad Mean: 0.503021 | Grad Max: 2.246016
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.005647
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007750 | Grad Max: 0.007750
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003266 | Grad Max: 0.393191
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060258 | Grad Max: 2.186040
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.010196
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035632 | Grad Max: 0.176351
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000620
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007243 | Grad Max: 0.015093
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000432
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001874 | Grad Max: 0.006595
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000820 | Grad Max: 0.003027
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024537 | Grad Max: 0.024537
[GRADIENT NORM TOTAL] 10.7660

>>> [TRAIN] BATCH 5 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.54094696 0.45905304] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.563 | Std: 0.085
[MASKS] A(Pass/Fail): 689/1359 | B: 626/1230 | C: 727/1321
[LOSS Ex1] A: 0.63013 | B: 0.60360 | C: 0.60427
[LOGITS Ex2 A] Mean Abs: 2.074 | Max: 6.300
[LOSS Ex2] A: 0.10544 | B: 0.30236 | C: 0.23467
** [JOINT LOSS] ** : 0.826829
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.005563 | Grad Max: 0.181007
  -> Layer: shared_layers.0.bias | Grad Mean: 0.505638 | Grad Max: 2.351692
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.005500
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000525 | Grad Max: 0.000525
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003151 | Grad Max: 0.461855
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.058379 | Grad Max: 2.534984
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.011852
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.035496 | Grad Max: 0.204254
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000527
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007135 | Grad Max: 0.014628
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000449
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001902 | Grad Max: 0.006345
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000795 | Grad Max: 0.002543
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024851 | Grad Max: 0.024851
[GRADIENT NORM TOTAL] 10.7856

>>> [TRAIN] BATCH 6 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331859 0.1668141] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.570 | Std: 0.090
[MASKS] A(Pass/Fail): 745/1303 | B: 676/1372 | C: 699/1349
[LOSS Ex1] A: 0.62344 | B: 0.60390 | C: 0.60141
[LOGITS Ex2 A] Mean Abs: 2.120 | Max: 7.050
[LOSS Ex2] A: 0.08966 | B: 0.32070 | C: 0.21334
** [JOINT LOSS] ** : 0.817487
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004892 | Grad Max: 0.219203
  -> Layer: shared_layers.0.bias | Grad Mean: 0.576086 | Grad Max: 2.925282
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005427
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007062 | Grad Max: 0.007062
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003501 | Grad Max: 0.485263
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.065402 | Grad Max: 2.698235
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.010652
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.038217 | Grad Max: 0.191218
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000602
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.007585 | Grad Max: 0.016663
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000481
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.002004 | Grad Max: 0.006927
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000881 | Grad Max: 0.003071
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.027119 | Grad Max: 0.027119
[GRADIENT NORM TOTAL] 12.4205

>>> [TRAIN] BATCH 7 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010807 0.4989193] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.089
[MASKS] A(Pass/Fail): 726/1322 | B: 665/1383 | C: 712/1336
[LOSS Ex1] A: 0.63079 | B: 0.60415 | C: 0.60024
[LOGITS Ex2 A] Mean Abs: 2.125 | Max: 6.172
[LOSS Ex2] A: 0.09164 | B: 0.31723 | C: 0.21131
** [JOINT LOSS] ** : 0.818454
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004494 | Grad Max: 0.175036
  -> Layer: shared_layers.0.bias | Grad Mean: 0.473940 | Grad Max: 2.295432
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.004830
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000634 | Grad Max: 0.000634
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002939 | Grad Max: 0.399060
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053932 | Grad Max: 2.203146
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000289 | Grad Max: 0.010006
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031327 | Grad Max: 0.160608
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000483
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006300 | Grad Max: 0.013070
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000356
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001653 | Grad Max: 0.005524
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000719 | Grad Max: 0.002710
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022183 | Grad Max: 0.022183
[GRADIENT NORM TOTAL] 10.0767

>>> [TRAIN] BATCH 8 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557064  0.24429356] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.567 | Std: 0.090
[MASKS] A(Pass/Fail): 713/1335 | B: 674/1374 | C: 713/1335
[LOSS Ex1] A: 0.62568 | B: 0.59966 | C: 0.59937
[LOGITS Ex2 A] Mean Abs: 2.093 | Max: 6.156
[LOSS Ex2] A: 0.11825 | B: 0.29279 | C: 0.20571
** [JOINT LOSS] ** : 0.813824
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.006023 | Grad Max: 0.177968
  -> Layer: shared_layers.0.bias | Grad Mean: 0.500950 | Grad Max: 2.252007
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005247
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.007556 | Grad Max: 0.007556
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003097 | Grad Max: 0.395739
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.057109 | Grad Max: 2.157021
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000318 | Grad Max: 0.009114
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033829 | Grad Max: 0.170937
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000570
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006807 | Grad Max: 0.014402
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000381
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001797 | Grad Max: 0.006181
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000790 | Grad Max: 0.002967
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.023809 | Grad Max: 0.023809
[GRADIENT NORM TOTAL] 10.4239

>>> [TRAIN] BATCH 9 START <<<
[DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482838 0.3517162] | Indices: [0 1] | Label Real: 0
[CONFIDENCE A] Mean: 0.569 | Std: 0.090
[MASKS] A(Pass/Fail): 588/1028 | B: 626/1230 | C: 741/1307
[LOSS Ex1] A: 0.62377 | B: 0.60360 | C: 0.59659
[LOGITS Ex2 A] Mean Abs: 2.148 | Max: 10.256
[LOSS Ex2] A: 0.10247 | B: 0.30266 | C: 0.19399
** [JOINT LOSS] ** : 0.807691
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004910 | Grad Max: 0.157883
  -> Layer: shared_layers.0.bias | Grad Mean: 0.466297 | Grad Max: 2.143766
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005608
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000444 | Grad Max: 0.000444
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002949 | Grad Max: 0.367787
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053999 | Grad Max: 2.035715
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.008992
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.033028 | Grad Max: 0.171522
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000502
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006718 | Grad Max: 0.013898
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000431
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001747 | Grad Max: 0.006530
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000722 | Grad Max: 0.002861
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022394 | Grad Max: 0.022394
[GRADIENT NORM TOTAL] 9.8129

>>> [TRAIN] BATCH 10 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850874 0.49149126] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.569 | Std: 0.091
[MASKS] A(Pass/Fail): 729/1319 | B: 676/1372 | C: 738/1310
[LOSS Ex1] A: 0.62457 | B: 0.60390 | C: 0.59436
[LOGITS Ex2 A] Mean Abs: 2.149 | Max: 10.342
[LOSS Ex2] A: 0.09548 | B: 0.32438 | C: 0.20165
** [JOINT LOSS] ** : 0.814785
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004997 | Grad Max: 0.190291
  -> Layer: shared_layers.0.bias | Grad Mean: 0.522649 | Grad Max: 2.510825
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005315
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.000801 | Grad Max: 0.000801
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.003266 | Grad Max: 0.480216
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.060552 | Grad Max: 2.650548
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000319 | Grad Max: 0.009269
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034541 | Grad Max: 0.173877
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000517
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006964 | Grad Max: 0.013582
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000405
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001839 | Grad Max: 0.006038
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000779 | Grad Max: 0.002799
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024194 | Grad Max: 0.024194
[GRADIENT NORM TOTAL] 11.2860

>>> [TRAIN] BATCH 11 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.50378674 0.4962133 ] | Indices: [0 1] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.091
[MASKS] A(Pass/Fail): 717/1331 | B: 665/1383 | C: 697/1351
[LOSS Ex1] A: 0.62125 | B: 0.60415 | C: 0.60354
[LOGITS Ex2 A] Mean Abs: 2.126 | Max: 7.049
[LOSS Ex2] A: 0.09714 | B: 0.31972 | C: 0.22355
** [JOINT LOSS] ** : 0.823120
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004298 | Grad Max: 0.165555
  -> Layer: shared_layers.0.bias | Grad Mean: 0.472334 | Grad Max: 2.113328
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.005500
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002926 | Grad Max: 0.002926
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002889 | Grad Max: 0.377320
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.053513 | Grad Max: 2.095467
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000293 | Grad Max: 0.009286
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.031812 | Grad Max: 0.158892
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000462
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006425 | Grad Max: 0.012741
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000401
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001717 | Grad Max: 0.006114
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000799 | Grad Max: 0.002788
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024305 | Grad Max: 0.024305
[GRADIENT NORM TOTAL] 9.7884

>>> [TRAIN] BATCH 12 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.51225185 0.48774815] | Indices: [1 0] | Label Real: 1
[CONFIDENCE A] Mean: 0.568 | Std: 0.090
[MASKS] A(Pass/Fail): 724/1324 | B: 674/1374 | C: 747/1301
[LOSS Ex1] A: 0.62021 | B: 0.59966 | C: 0.59277
[LOGITS Ex2 A] Mean Abs: 2.090 | Max: 6.000
[LOSS Ex2] A: 0.11596 | B: 0.28734 | C: 0.19064
** [JOINT LOSS] ** : 0.802194
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004727 | Grad Max: 0.175514
  -> Layer: shared_layers.0.bias | Grad Mean: 0.460077 | Grad Max: 2.239958
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.006074
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.005551 | Grad Max: 0.005551
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002800 | Grad Max: 0.329440
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.051954 | Grad Max: 1.797398
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.011107
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.032540 | Grad Max: 0.190444
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000468
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006503 | Grad Max: 0.013475
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000404
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001715 | Grad Max: 0.006109
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000726 | Grad Max: 0.002940
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.022419 | Grad Max: 0.022419
[GRADIENT NORM TOTAL] 9.5748

>>> [TRAIN] BATCH 13 START <<<
[DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000
[LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073
[SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004028  0.49959713] | Indices: [1 0] | Label Real: 0
[CONFIDENCE A] Mean: 0.565 | Std: 0.089
[MASKS] A(Pass/Fail): 686/1362 | B: 626/1230 | C: 520/856
[LOSS Ex1] A: 0.62925 | B: 0.60360 | C: 0.59576
[LOGITS Ex2 A] Mean Abs: 2.088 | Max: 5.832
[LOSS Ex2] A: 0.10364 | B: 0.29695 | C: 0.20532
** [JOINT LOSS] ** : 0.811507
[GRADIENTS CHECK]
  -> Layer: shared_layers.0.weight | Grad Mean: 0.004850 | Grad Max: 0.162666
  -> Layer: shared_layers.0.bias | Grad Mean: 0.458330 | Grad Max: 2.079142
  -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.005857
  -> Layer: exit1_layers.0.bias | Grad Mean: 0.002294 | Grad Max: 0.002294
  -> Layer: exit2_layers.0.weight | Grad Mean: 0.002962 | Grad Max: 0.334687
  -> Layer: exit2_layers.0.bias | Grad Mean: 0.054633 | Grad Max: 1.851790
  -> Layer: exit2_layers.3.weight | Grad Mean: 0.000316 | Grad Max: 0.011321
  -> Layer: exit2_layers.3.bias | Grad Mean: 0.034012 | Grad Max: 0.192770
  -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000492
  -> Layer: exit2_layers.6.bias | Grad Mean: 0.006939 | Grad Max: 0.014465
  -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000426
  -> Layer: exit2_layers.9.bias | Grad Mean: 0.001832 | Grad Max: 0.006598
  -> Layer: exit2_layers.12.weight | Grad Mean: 0.000777 | Grad Max: 0.002914
  -> Layer: exit2_layers.12.bias | Grad Mean: 0.024053 | Grad Max: 0.024053
[GRADIENT NORM TOTAL] 9.7263

[EPOCH SUMMARY] Train Loss: 0.8148

[VALIDATION] Starting...
[VAL] Processando primeiro batch de validação...
[EPOCH END] Val Loss: 0.8012 | Alpha: 0.5500
No improve count: 15/15
EARLY STOPPING TRIGGERED
No description has been provided for this image
Modelo treinado e salvo em 'models/teste_ljoint9.pth'
In [46]:
# Reload the best checkpoint saved during training, then evaluate the
# early-exit model on each held-out test set.
# NOTE(review): torch.load uses pickle under the hood — only load
# checkpoints from trusted sources (consider weights_only=True on
# recent PyTorch versions).
model.load_state_dict(torch.load(f'models/{modelname}.pth'))
print(f"Modelo 'models/{modelname}.pth' carregado\n")

# One entry per test set; the index selects the matching loader in
# `test_loaders`. Replaces three copy-pasted report blocks with a loop.
for idx, base_name in enumerate(["UNSW", "BOT", "CIC"]):
    if idx:
        # Blank separator line between datasets (matches original output).
        print()
    print(f"Base: {base_name}")
    results = evaluate_model(model, test_loaders[idx], limiar, device=device)
    print("-" * 20)
    print(f"  Accuracy: {results['accuracy']:.4f}%")
    print(f"  Avg. Inference Time: {results['avg_inference_time_ms']:.4f} ms")
    print(f"  Early Exit Rate: {results['exit_rate']:.4f}% ({results['exited_early_count']}/{results['total_samples']})")
    print("-" * 20)
Modelo 'models/teste_ljoint9.pth' carregado

Base: UNSW
No description has been provided for this image
True Positives (TP): 7440
True Negatives (TN): 8804
False Positives (FP): 196
False Negatives (FN): 1560

F1 Score: 0.8944
True Positive Rate (TPR) / Recall: 0.8267
True Negative Rate (TNR) / Specificity: 0.9782
--------------------
  Accuracy: 90.2444%
  Avg. Inference Time: 0.0025 ms
  Early Exit Rate: 35.0222% (6304/18000)
--------------------

Base: BOT
No description has been provided for this image
True Positives (TP): 2907
True Negatives (TN): 3636
False Positives (FP): 364
False Negatives (FN): 1093

F1 Score: 0.7996
True Positive Rate (TPR) / Recall: 0.7268
True Negative Rate (TNR) / Specificity: 0.9090
--------------------
  Accuracy: 81.7875%
  Avg. Inference Time: 0.0025 ms
  Early Exit Rate: 32.9500% (2636/8000)
--------------------

Base: CIC
No description has been provided for this image
True Positives (TP): 12647
True Negatives (TN): 13591
False Positives (FP): 409
False Negatives (FN): 1353

F1 Score: 0.9349
True Positive Rate (TPR) / Recall: 0.9034
True Negative Rate (TNR) / Specificity: 0.9708
--------------------
  Accuracy: 93.7071%
  Avg. Inference Time: 0.0023 ms
  Early Exit Rate: 35.5750% (9961/28000)
--------------------
In [ ]:
 
In [ ]: